Import necessary packages and load data

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score


# Locations of the pre-extracted ResNet101 features and their class labels.
features_file = 'AwA2-features/Animals_with_Attributes2/Features/ResNet101/AwA2-features.txt'
labels_file = 'AwA2-features/Animals_with_Attributes2/Features/ResNet101/AwA2-labels.txt'

# The dataset holds 37322 images in total, spread over 50 classes; every
# image is described by one 2048-dimensional feature vector.
features = np.loadtxt(features_file)  # expected shape: (37322, 2048)
labels = np.loadtxt(labels_file)  # expected shape: (37322,)

# Stratified 60% / 40% train/test split: stratifying on the labels keeps the
# class proportions the same in both parts. The fixed integer random_state
# makes the split reproducible from run to run.
train_test_parts = train_test_split(
    features,
    labels,
    stratify=labels,
    random_state=0,
    train_size=0.6,
    test_size=0.4,
)
X_train, X_test, Y_train, Y_test = train_test_parts

Feature Selection
A 2048-dimensional binary mask (e.g. [0, 0, 1, 0, 1, 1, ..., 0, 0]) is applied to
the feature vector of each image. We want to find one or several masks such that the
resulting lower-dimensional features work well, presumably better than the original
2048-dimensional features.

We use an evolutionary approach. Namely, we initialize a population of masks (say 10,000
out of 2^2048 possibilities). Next, we train a linear SVM for each mask and evaluate
the fitness of that mask as the SVM's mean accuracy on a held-out development set. We
then pick the top 10% (1,000) of the masks, make small changes to them (mutation) to grow
the population back to 10,000, and iterate.

In [None]:
# ---- Settings for the evolutionary search ----
# Length of one image's feature vector (2048 in this case).
d = X_train.shape[1]
# How many rounds of evolution to run.
iter_num = 5
# How many masks make up the population in each round.
population_num = 100
# Fraction of best masks kept each round; they are mutated to produce the
# masks of the next round.
percentile = 0.1
# Regularisation hyperparameter C of the linear SVM. Empirically selected
# from the previous experiment.
c = 0.001


# Carve a development set (20%) out of the training data, stratified by
# label. X_train / Y_train are rebound to the remaining 80%.
X_train, X_dev, Y_train, Y_dev = train_test_split(
    X_train,
    Y_train,
    stratify=Y_train,
    random_state=0,
    train_size=0.8,
    test_size=0.2,
)

# Starting population: population_num random boolean masks of length d,
# one mask per row.
mask_population = np.random.randint(0, 2, size=(population_num, d), dtype=bool)


# Evolutionary search over feature masks.
#
# Each round: (1) score every mask by training a linear SVM on the masked
# training features and measuring accuracy on the masked development set,
# (2) keep the best `percentile` fraction of masks, (3) mutate the survivors
# and top the population back up to `population_num` with fresh random masks.

# All 2^3 = 8 on/off assignments for the three mutated positions. This is
# loop-invariant, so it is built once up front.
bit_patterns = [
    (i_bool, j_bool, k_bool)
    for i_bool in (True, False)
    for j_bool in (True, False)
    for k_bool in (True, False)
]

for it in range(iter_num):
    print('\n' + str(it+1) + 'th round:')
    print('mask population shape = ' + str(mask_population.shape) + '\n')

    # Collects a (mask, fitness) pair for every mask evaluated this round.
    mask_fitness_list = []

    for cnt, mask in enumerate(mask_population, start=1):
        if mask.any():
            # Boolean column indexing keeps exactly the active components of
            # every sample in one vectorized step.
            masked_X_train = X_train[:, mask]
            masked_X_dev = X_dev[:, mask]

            # Train a linear SVM on the masked training features and use its
            # mean accuracy on the development set as the mask's fitness.
            clf = SVC(kernel='linear', C=c)
            clf.fit(masked_X_train, Y_train)
            fitness = clf.score(masked_X_dev, Y_dev)
        else:
            # A mask with no active component leaves zero features, which
            # the SVM cannot be fitted on; give it the worst possible score.
            fitness = 0.0

        print('Mask ' + str(cnt) + ', # of active component = ' + str(np.sum(mask)) + '\nMean Accuracy = ' + str(fitness))

        mask_fitness_list.append((mask, fitness))

    # Keep the top fraction of masks. The sort must be descending: an
    # ascending sort followed by a head slice would keep the worst masks.
    # At least one survivor is kept so the next round always has a parent.
    num_top = max(1, int(percentile * len(mask_fitness_list)))
    top_mask_fitness_list = sorted(
        mask_fitness_list, key=lambda pair: pair[1], reverse=True)[:num_top]
    top_masks = [pair[0] for pair in top_mask_fitness_list]
    top_fitness = [pair[1] for pair in top_mask_fitness_list]

    print("\nMean performance of top 10 percent masks in this round: %0.2f (+/- %0.2f)" % (np.mean(top_fitness), np.std(top_fitness) * 2))

    # Mutation: for each surviving mask pick three distinct positions and
    # emit one child per on/off assignment of those positions (8 children).
    # One of the 8 assignments equals the parent's own bits, so each parent
    # is carried over unchanged alongside its 7 mutated variants.
    children = []
    for parent in top_masks:
        # replace=False guarantees three different indices; repeated indices
        # would collapse several of the 8 children into duplicates.
        # NOTE: this requires d >= 3.
        positions = np.random.choice(d, size=3, replace=False)
        for pattern in bit_patterns:
            # Work on a copy: `parent` is a view into the current population
            # array and must not be modified in place.
            child = parent.copy()
            child[positions] = pattern
            children.append(child)

    # Assemble the next population. reshape(-1, d) keeps the array 2-D even
    # when there are no children, and the slice caps it at population_num
    # (children of the best parents come first, so those are the ones kept).
    next_population = np.array(children, dtype=bool).reshape(-1, d)[:population_num]

    # Fill the remaining slots with completely random masks to keep
    # exploring new regions of the search space.
    num_random = population_num - len(next_population)
    new_random_masks = np.random.randint(2, size=(num_random, d), dtype=bool)
    mask_population = np.vstack((next_population, new_random_masks))
