# Feature Selection Genetic Algorithm

The principle behind the genetic algorithm for feature selection is relatively simple: the feature space is represented by a binary array. A feature encoded with a one is present in the optimized feature set, while a feature encoded with a zero has been removed.

In [None]:
import numpy as np

from ase.ga.data import DataConnection

from atoml.api.ase_data_setup import get_unique, get_train
from atoml.fingerprint.setup import FeatureGenerator
from atoml.regression import GaussianProcess
from atoml.preprocess.feature_engineering import single_transform
from atoml.ga import GeneticAlgorithm

In [None]:
# Connect to the ASE genetic-algorithm database that stores the candidates.
gadb = DataConnection('../../data/gadb.db')

# Get all relaxed candidates from the db file.
# NOTE(review): per the ASE docs, use_extinct=False means the "extinct" key
# is not used when selecting candidates -- confirm this is the intent.
all_cand = gadb.get_all_relaxed_candidates(use_extinct=False)

In [None]:
# Split the candidates: 100 structures for testing, 500 for training, both
# using the 'raw_score' key for the target.
# NOTE(review): `taken` is presumably the list of candidates already used by
# the test set, so that get_train avoids them -- confirm in atoml.
testset = get_unique(atoms=all_cand, size=100, key='raw_score')
trainset = get_train(
    atoms=all_cand,
    size=500,
    taken=testset['taken'],
    key='raw_score',
)

# Target values used to fit and to validate the regression model.
testval = testset['target']
trainval = trainset['target']

In [None]:
# Generate eigenspectrum fingerprints for both data sets with one generator.
# NOTE(review): atom_types are presumably atomic numbers (78 = Pt, 79 = Au).
generator = FeatureGenerator(atom_types=[78, 79], nprocs=1)
feature_funcs = [generator.eigenspectrum_vec]

raw_train = generator.return_vec(trainset['atoms'], feature_funcs)
raw_test = generator.return_vec(testset['atoms'], feature_funcs)

# Apply the same single-feature transforms to the training and test matrices.
train_data = single_transform(raw_train)
test_data = single_transform(raw_test)

In [None]:
# Baseline: train a Gaussian process on the complete feature set and report
# its error on the test set, for comparison with the GA-selected subsets.
kdict = {
    'k1': {
        'type': 'gaussian',
        'width': 1.,
        'scaling': 1.,
        'dimension': 'single',
    },
}

gp = GaussianProcess(
    train_fp=train_data, train_target=trainval, kernel_dict=kdict,
    regularization=1e-2, optimize_hyperparameters=True, scale_data=True)

pred = gp.predict(
    test_fp=test_data, test_target=testval,
    get_validation_error=True, get_training_error=True)

# Average RMSE on the validation (test) data.
score = pred['validation_error']['rmse_average']
print('all features: {0:.3f}'.format(score))

In [None]:
def fitf(x):
    """Define the fitness function for the GA.

    Trains a Gaussian process on the feature columns selected by ``x`` and
    evaluates it on the matching columns of the test data.

    Parameters
    ----------
    x : array-like
        Binary feature mask with one entry per column of ``train_data``.
        A non-zero entry keeps the feature, a zero removes it.

    Returns
    -------
    Negated average validation RMSE (``-score``), so that a lower
    prediction error yields a higher fitness value.
    """
    # The builtin ``bool`` replaces the ``np.bool`` alias, which was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24.
    mask = np.asarray(x, dtype=bool)

    # NOTE(review): an all-zero mask selects no columns; how GaussianProcess
    # handles an empty feature matrix is not verified here.
    new_train = train_data[:, mask]
    new_test = test_data[:, mask]

    kdict = {'k1':
             {
                 'type': 'gaussian', 'width': 1., 'scaling': 1.,
                 'dimension': 'single'
             }
             }
    gp = GaussianProcess(train_fp=new_train,
                         train_target=trainval,
                         kernel_dict=kdict,
                         regularization=1e-2,
                         optimize_hyperparameters=True,
                         scale_data=True)

    pred = gp.predict(test_fp=new_test, test_target=testval,
                      get_validation_error=True,
                      get_training_error=True)

    score = pred['validation_error']['rmse_average']

    # Count from the mask rather than from ``x`` so that the report matches
    # the columns actually used and also works when ``x`` is a plain list.
    n_kept = int(mask.sum())
    n_removed = mask.size - n_kept
    print('error: {0:.3f} for {1} features and {2} eliminated'.format(
        score, n_kept, n_removed))

    return -score

In [None]:
# The search dimension is the number of feature columns in the training data.
n_features = np.shape(train_data)[1]

ga = GeneticAlgorithm(
    fit_func=fitf,
    pop_size=10,
    dimension=n_features,
    pop=None,
)

# NOTE(review): the positional 20 is presumably the number of GA steps.
ga.search(20, verbose=True)