# Agnostic Feature Selection
The goal of this project is to replicate and extend the results reported in the [Agnostic Feature Selection](https://www.ecmlpkdd2019.org/downloads/paper/744.pdf) paper.

In [None]:
# Sanity check of the tensorflow-gpu installation: the value returned by the
# call below is shown as the cell output.
import tensorflow as tf

tf.test.is_gpu_available(cuda_only=False, min_cuda_compute_capability=None)

## Imports

In [None]:
import scipy.io
import numpy as np
from skfeature.function.sparse_learning_based import NDFS, MCFS
from skfeature.function.similarity_based import SPEC, lap_score
from skfeature.utility import construct_W, unsupervised_evaluation
from skfeature.utility.sparse_learning import feature_ranking
from sklearn.metrics import accuracy_score
from keras.layers import Input, Dense
from keras.models import Model
from keras import regularizers
from keras.callbacks import TensorBoard
import matplotlib.pyplot as plt

In [None]:
# To ignore warnings, especially DeprecationWarnings
import warnings
warnings.filterwarnings('ignore')

## Load data sets

In [None]:
# Name of the data set. It is the base name of the .mat file that the loading
# cell reads from scikit-feature/skfeature/data/.
NAME = "COIL20"

In [None]:
# Read the MATLAB file of the selected data set, then extract the feature
# matrix (cast to float) and the first column of the label array.
mat = scipy.io.loadmat('scikit-feature/skfeature/data/' + NAME + '.mat')
X = mat['X'].astype(float)
y = mat['Y'][:, 0]

In [None]:
# Estimated intrinsic dimension (ID); the keys are presumably data set names.
# NOTE(review): there is no entry for "COIL20" -- TODO confirm whether one is needed.
data_ids = {
    "arcene": 40,
    "Isolet": 9,
    "ORL": 6,
    "pixraw10P": 4,
    "ProstateGE": 23,
    "TOX171": 15,
    "warpPie10P": 3,
    "Yale": 10,
}

## Experiments

### Laplacian

In [None]:
%%time
# Settings for the clustering-based evaluation
num_fea = 100       # how many top-ranked features are kept
num_cluster = 20    # usually set to the number of classes in the ground truth
n_repeats = 20      # number of evaluation runs that are averaged

# Build the affinity matrix W
kwargs_W = {
    "metric": "euclidean",
    "neighbor_mode": "knn",
    "weight_mode": "heat_kernel",
    "k": 5,
    't': 1,
}
W = construct_W.construct_W(X, **kwargs_W)

# Score every feature using W, then rank the features by score (ascending)
score = lap_score.lap_score(X, W=W)
idx = lap_score.feature_ranking(score)

# Keep only the columns of the num_fea best-ranked features
selected_features = X[:, idx[:num_fea]]

# Repeat the k-means based evaluation and accumulate NMI and ACC
nmi_total, acc_total = 0, 0
for i in range(n_repeats):
    nmi, acc = unsupervised_evaluation.evaluation(
        X_selected=selected_features, n_clusters=num_cluster, y=y
    )
    nmi_total, acc_total = nmi_total + nmi, acc_total + acc

# Report the averages over all runs
print('NMI:', float(nmi_total) / n_repeats)
print('ACC:', float(acc_total) / n_repeats)

### SPEC

In [None]:
%%time
# Settings for the clustering-based evaluation
num_fea = 100       # how many top-ranked features are kept
num_cluster = 20    # usually set to the number of classes in the ground truth
n_repeats = 20      # number of evaluation runs that are averaged

# style 0 selects the second ranking function, which uses all except the
# 1st eigenvalue
kwargs = {'style': 0}

# Score every feature, then rank the features by score (descending)
score = SPEC.spec(X, **kwargs)
idx = SPEC.feature_ranking(score, **kwargs)

# Keep only the columns of the num_fea best-ranked features
selected_features = X[:, idx[:num_fea]]

# Repeat the k-means based evaluation and accumulate NMI and ACC
nmi_total, acc_total = 0, 0
for i in range(n_repeats):
    nmi, acc = unsupervised_evaluation.evaluation(
        X_selected=selected_features, n_clusters=num_cluster, y=y
    )
    nmi_total, acc_total = nmi_total + nmi, acc_total + acc

# Report the averages over all runs
print('NMI:', float(nmi_total) / n_repeats)
print('ACC:', float(acc_total) / n_repeats)

### MCFS

In [None]:
%%time
# Settings for the clustering-based evaluation
num_fea = 100       # Number of selected features
num_cluster = 20    # Number of clusters, it is usually set as the number of classes in the ground truth
n_repeats = 20      # Number of evaluation runs that are averaged

# Construct affinity matrix.
# The option keys must be the snake_case names `neighbor_mode` / `weight_mode`
# (value `heat_kernel`), exactly as in the Laplacian cell. The camelCase
# spellings used here before are not recognised by construct_W, so the
# requested heat-kernel weighting was not actually applied.
kwargs = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
W = construct_W.construct_W(X, **kwargs)

# Obtain the feature weight matrix (one cluster per expected class)
Weight = MCFS.mcfs(X, n_selected_features=num_fea, W=W, n_clusters=num_cluster)

# Rank the features from the weight matrix; the first num_fea indices are
# the features kept below
idx = MCFS.feature_ranking(Weight)

# Obtain the dataset on the selected features
selected_features = X[:, idx[0:num_fea]]

# Perform kmeans clustering based on the selected features, n_repeats times
nmi_total = 0
acc_total = 0
for i in range(n_repeats):
    nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y)
    nmi_total += nmi
    acc_total += acc

# Output the average NMI and average ACC
print('NMI:', float(nmi_total) / n_repeats)
print('ACC:', float(acc_total) / n_repeats)

### NDFS

In [None]:
%%time
# Settings for the clustering-based evaluation
num_fea = 100       # Number of selected features
num_cluster = 20    # Number of clusters, it is usually set as the number of classes in the ground truth
n_repeats = 20      # Number of evaluation runs that are averaged

# Construct affinity matrix.
# The option keys must be the snake_case names `neighbor_mode` / `weight_mode`
# (value `heat_kernel`), exactly as in the Laplacian cell. The camelCase
# spellings used here before are not recognised by construct_W, so the
# requested heat-kernel weighting was not actually applied.
kwargs = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
W = construct_W.construct_W(X, **kwargs)

# Obtain the feature weight matrix (one cluster per expected class)
Weight = NDFS.ndfs(X, W=W, n_clusters=num_cluster)

# Rank the features from the weight matrix; the first num_fea indices are
# the features kept below
idx = feature_ranking(Weight)

# Obtain the dataset on the selected features
selected_features = X[:, idx[0:num_fea]]

# Perform kmeans clustering based on the selected features, n_repeats times
nmi_total = 0
acc_total = 0
for i in range(n_repeats):
    nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y)
    nmi_total += nmi
    acc_total += acc

# Output the average NMI and average ACC
print('NMI:', float(nmi_total) / n_repeats)
print('ACC:', float(acc_total) / n_repeats)

### AGNOS
Below is the autoencoder [example](https://blog.keras.io/building-autoencoders-in-keras.html) from the Keras blog.

In [None]:
# Size of the encoded representation: 32 floats, i.e. a compression factor of
# 24.5 for an input of 784 floats
encoding_dim = 32

# Input placeholder for the 784 input values
input_img = Input(shape=(784,))
# Encoder: a dense ReLU layer producing the encoded representation
encoded = Dense(units=encoding_dim, activation='relu')(input_img)
# Decoder: a dense sigmoid layer producing the lossy reconstruction
decoded = Dense(units=784, activation='sigmoid')(encoded)

# Full model, mapping an input to its reconstruction
autoencoder = Model(inputs=input_img, outputs=decoded)

In [None]:
# Encoder-only model: maps an input to its encoded representation
encoder = Model(inputs=input_img, outputs=encoded)

In [None]:
# The decoder reuses the final layer of the autoencoder model
decoder_layer = autoencoder.layers[-1]
# Placeholder for an already-encoded input of encoding_dim values
encoded_input = Input(shape=(encoding_dim,))
# Decoder-only model: applies that layer to the encoded placeholder
decoder = Model(inputs=encoded_input, outputs=decoder_layer(encoded_input))

In [None]:
# Configure training: Adadelta optimizer with a binary cross-entropy loss
autoencoder.compile(
    optimizer='adadelta',
    loss='binary_crossentropy',
)

In [None]:
from keras.datasets import mnist

# Load both MNIST splits and keep only the first element of each pair (the
# second elements are discarded into `_`)
train_split, test_split = mnist.load_data()
x_train, _ = train_split
x_test, _ = test_split

In [None]:
# Convert each split to float32, divide by 255 and flatten every sample into
# a single vector.
# NOTE: the cell rebinds x_train / x_test, so running it a second time
# divides the values by 255 again.
x_train = x_train.astype('float32') / 255.
x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:])))

x_test = x_test.astype('float32') / 255.
x_test = x_test.reshape((len(x_test), np.prod(x_test.shape[1:])))

# Show the resulting shapes (train first, then test)
for split in (x_train, x_test):
    print(split.shape)

In [None]:
# Train the autoencoder to reproduce its own input; the test split is used
# as validation data.
fit_options = {
    "epochs": 50,
    "batch_size": 256,
    "shuffle": True,
    "validation_data": (x_test, x_test),
}
autoencoder.fit(x_train, x_train, **fit_options)

In [None]:
# Encode and then decode some digits.
# Note that they are taken from the *test* set; the decoder is fed the
# output of the encoder.
encoded_imgs = encoder.predict(x_test)
decoded_imgs = decoder.predict(encoded_imgs)

In [None]:
# Plot with Matplotlib: a 2 x n grid of 28x28 images
import matplotlib.pyplot as plt

n = 10  # number of digits to display
plt.figure(figsize=(20, 4))
for i in range(n):
    # Row 0 holds the original test digit, row 1 its reconstruction
    for row, images in enumerate((x_test, decoded_imgs)):
        ax = plt.subplot(2, n, i + 1 + row * n)
        plt.imshow(images[i].reshape(28, 28))
        plt.gray()
        # Hide both axes of this subplot
        for axis in (ax.get_xaxis(), ax.get_yaxis()):
            axis.set_visible(False)
plt.show()

In [None]:
from keras import regularizers

# Sparse variant of the autoencoder: same layer sizes, plus an L1 activity
# regularizer on the encoded layer.
encoding_dim = 32

input_img = Input(shape=(784,))
# add a Dense layer with a L1 activity regularizer (note: 10e-5 equals 1e-4)
encoded = Dense(encoding_dim, activation='relu',
                activity_regularizer=regularizers.l1(10e-5))(input_img)
decoded = Dense(784, activation='sigmoid')(encoded)

autoencoder = Model(input_img, decoded)
autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')

# Rebuild the stand-alone encoder and decoder on the NEW layers. Without this,
# `encoder` and `decoder` keep wrapping the layers of the previous autoencoder,
# so re-running the predict and plot cells would show the old model instead of
# the regularized one.
encoder = Model(input_img, encoded)
encoded_input = Input(shape=(encoding_dim,))
decoder = Model(encoded_input, autoencoder.layers[-1](encoded_input))

# To train and inspect this model, re-run the fit, predict and plot cells above.