# Feature Selection - Wrapper Approach using a Genetic Algorithm
In this notebook we implement a simple feature selection procedure that follows a wrapper approach: the search algorithm (a genetic algorithm in this case) is wrapped around the target classification/regression algorithm, which is used to evaluate each candidate feature subset.

In [1]:
# install the evolutionary computation library
!pip install deap



In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn import datasets
from sklearn import linear_model
from sklearn import naive_bayes

from deap import algorithms
from deap import base
from deap import creator
from deap import tools

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [3]:
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell requires an older scikit-learn release.
data = datasets.load_boston()

X = data["data"]
y = data["target"]

number_of_variables = X.shape[1]
input_variables = data.feature_names
target_variable = 'MEDV'

# Seed BOTH generators: NumPy's and Python's built-in `random` module.
# The individuals are generated with random.randint, so seeding NumPy alone
# does not make the genetic algorithm runs repeatable.
seed = 1234
np.random.seed(seed)
random.seed(seed)

# Cross-validation splitter used by the fitness functions.
kfolds = KFold(10, shuffle=True, random_state=seed)

# Also build a pandas data frame: input variables first, target as last column.
df = pd.DataFrame(data.data, columns=data.feature_names)
df[target_variable] = y

# Last expression of the cell, so the preview is actually displayed.
df.head()

In [4]:
def EvaluateFeatureSubsetSingleObjective(individual):
    '''Fitness of a feature subset for the single-objective genetic algorithm.

    individual -- sequence of alleles; an allele equal to 1 at position i
                  selects column i of the data frame `df`.

    Returns a one-element tuple holding the mean of the cross-validation
    scores (splits given by `kfolds`) of a linear regression fitted on the
    selected columns only.

    An individual that selects no column cannot be fitted; it is given a
    fitness of 0.0, the same score the multi-objective evaluator assigns
    to an empty subset.
    '''
    selected_columns = [
        df.columns[i] for i, allele in enumerate(individual) if allele == 1
    ]

    # Guard: an empty feature matrix would make cross_val_score fail
    # (or produce a NaN fitness), which would break the evolution.
    if not selected_columns:
        return 0.0,

    model = linear_model.LinearRegression()
    scores = cross_val_score(model, df[selected_columns], y, cv=kfolds)
    return scores.mean(),

## Simple Genetic Algorithm
It looks for the feature subset that maximizes the overall performance.

In [5]:
# Single-objective fitness: the single weight of +1.0 means "maximize".
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
# An individual is a plain list of alleles that carries a FitnessMax fitness.
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()

# Allele generator: draws 0 or 1.
toolbox.register("attr_bool", random.randint, 0, 1)

# An individual has one allele per input variable;
# a population is a list of such individuals.
toolbox.register(
    "individual",
    tools.initRepeat,
    creator.Individual,
    toolbox.attr_bool,
    number_of_variables,
)
toolbox.register(
    "population",
    tools.initRepeat,
    list,
    toolbox.individual,
)

In [6]:
# Genetic operators for the single-objective run.
# "individual" is already registered where the toolbox is created,
# so the duplicate registration is not repeated here.
toolbox.register("evaluate", EvaluateFeatureSubsetSingleObjective)  # fitness function
toolbox.register("mate", tools.cxTwoPoint)                          # two-point crossover
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)            # flip each bit with probability 0.05
toolbox.register("select", tools.selTournament, tournsize=3)        # tournaments of 3 individuals

In [7]:
# Per-generation statistics, computed over the fitness values of the population.
stats = tools.Statistics(key=lambda ind: ind.fitness.values)
stats.register("avg", np.mean)
stats.register("std", np.std)
stats.register("min", np.min)
stats.register("max", np.max)

# Hall of fame of size 1, passed to the algorithm below.
hof = tools.HallOfFame(1)

# Initial population of 100 individuals, evolved for 40 generations.
pop = toolbox.population(n=100)
pop, log = algorithms.eaSimple(
    pop,
    toolbox,
    cxpb=0.5,
    mutpb=0.2,
    ngen=40,
    stats=stats,
    halloffame=hof,
    verbose=True,
)

gen	nevals	avg   	std     	min     	max     
0  	100   	0.5295	0.111939	0.230525	0.673672
1  	58    	0.608515	0.0522335	0.311601	0.676593
2  	62    	0.639771	0.0246692	0.560079	0.679719
3  	76    	0.655745	0.0198035	0.571319	0.679957
4  	62    	0.665384	0.017312 	0.585463	0.683319
5  	62    	0.673607	0.0106756	0.604827	0.683319
6  	67    	0.677223	0.0106139	0.596015	0.686291
7  	47    	0.68042 	0.00520338	0.641073	0.686291
8  	62    	0.681441	0.00823372	0.61548 	0.686291
9  	55    	0.682398	0.00872825	0.625077	0.686291
10 	56    	0.684159	0.00669046	0.644112	0.686291
11 	66    	0.684537	0.00695231	0.642208	0.686291
12 	62    	0.684968	0.00670405	0.642208	0.686291
13 	65    	0.685668	0.00267743	0.669372	0.686291
14 	49    	0.683982	0.011128  	0.610402	0.686291
15 	56    	0.683664	0.0108812 	0.601928	0.686291
16 	69    	0.681449	0.0273114 	0.43426 	0.686291
17 	60    	0.680364	0.0289233 	0.417941	0.686291
18 	62    	0.682277	0.0154917 	0.587232	0.686291
19 	61    	0.683507	0.0128785 	0.5

## Multi-objective Version
It applies a multi-objective genetic algorithm that tries to maximize the performance while minimizing the number of features involved.

In [8]:
def EvaluateFeatureSubsetMultipleObjective(individual):
    '''Two-objective fitness of a feature subset.

    individual -- sequence of alleles; an allele equal to 1 at position i
                  selects column i of the data frame `df`.

    Returns a pair:
      * the mean cross-validation score (splits given by `kfolds`) of a
        linear regression fitted on the selected columns;
      * the fraction of selected features, sum(individual) / len(individual).

    When no column is selected nothing is fitted and the pair
    (0, len(individual)) is returned instead.
    '''
    chosen = [df.columns[pos] for pos, gene in enumerate(individual) if gene == 1]

    # Empty subset: skip the model entirely.
    if not chosen:
        return 0, len(individual)

    regressor = linear_model.LinearRegression()
    cv_scores = cross_val_score(regressor, df[chosen], y, cv=kfolds)
    mean_score = cv_scores.mean()
    selected_fraction = sum(individual) / float(len(individual))
    return mean_score, selected_fraction

In [9]:
# Two objectives: the first value is maximized (weight +1.0),
# the second one is minimized (weight -1.0).
creator.create("FitnessMulti", base.Fitness, weights=(1.0, -1.0))
# Re-creates creator.Individual (earlier tied to FitnessMax),
# this time carrying the two-objective fitness.
creator.create("Individual", list, fitness=creator.FitnessMulti)

# A fresh toolbox for the multi-objective run.
toolbox = base.Toolbox()

# Allele generator: draws 0 or 1.
toolbox.register("attr_bool", random.randint, 0, 1)

# One allele per input variable; a population is a list of individuals.
toolbox.register(
    "individual",
    tools.initRepeat,
    creator.Individual,
    toolbox.attr_bool,
    number_of_variables,
)
toolbox.register(
    "population",
    tools.initRepeat,
    list,
    toolbox.individual,
)

# Evaluation, variation and selection operators.
toolbox.register("evaluate", EvaluateFeatureSubsetMultipleObjective)
toolbox.register("mate", tools.cxUniform, indpb=0.1)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selNSGA2)



In [10]:
# Sizes handed to eaMuPlusLambda as mu and lambda_.
MU = 100
LAMBDA = 200

# Statistics are taken per objective (axis=0) over the fitness pairs.
stats = tools.Statistics(key=lambda ind: ind.fitness.values)
stats.register("avg", np.mean, axis=0)
stats.register("std", np.std, axis=0)
stats.register("min", np.min, axis=0)
stats.register("max", np.max, axis=0)

# A Pareto front is used as the hall of fame for this run.
hof = tools.ParetoFront()

pop = toolbox.population(n=MU)
pop, logbook = algorithms.eaMuPlusLambda(
    pop,
    toolbox,
    mu=MU,
    lambda_=LAMBDA,
    cxpb=0.7,
    mutpb=0.3,
    ngen=40,
    stats=stats,
    halloffame=hof,
)

# Show the first entry of the Pareto front.
first_of_front = hof[0]
print("BEST " + str(first_of_front))


gen	nevals	avg                    	std                    	min                    	max                    
0  	100   	[0.50505749 0.48846154]	[0.13286397 0.15088496]	[0.09478814 0.07692308]	[0.67404805 0.84615385]
1  	200   	[0.54088189 0.45692308]	[0.15574829 0.21033082]	[0.09478814 0.07692308]	[0.6778203  0.84615385]
2  	200   	[0.5087206  0.36307692]	[0.18927323 0.2148565 ]	[0.09478814 0.07692308]	[0.6778203  0.84615385]
3  	200   	[0.45315159 0.32615385]	[0.23729778 0.24132769]	[0.09478814 0.07692308]	[0.6778203  0.84615385]
4  	200   	[0.44904449 0.33923077]	[0.25170748 0.25234873]	[0.12079571 0.07692308]	[0.67814381 0.92307692]
5  	200   	[0.57935314 0.39307692]	[0.14722082 0.21997714]	[0.19748379 0.07692308]	[0.67853904 0.92307692]
6  	200   	[0.63850749 0.42      ]	[0.07128686 0.18792545]	[0.23742923 0.07692308]	[0.68201157 1.        ]
7  	200   	[0.64021341 0.39      ]	[0.04919693 0.22669611]	[0.41057253 0.07692308]	[0.68201157 1.        ]
8  	200   	[0.65072675 0.39923077]	[0

## Discussion
Note that we applied the genetic algorithms using the entire dataset to evaluate each feature subset. In a real scenario we should first split the data into training and test sets, and then apply the genetic algorithm using only the training set.