In [1]:
# imports: dataset loader, modelling utilities and numeric helpers
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd 
import math

In [2]:
def init_population(n , c):
    """Create a random binary population and the initial duplicate memory.

    Parameters
    ----------
    n : int
        Number of individuals (rows).
    c : int
        Number of genes per individual (columns).

    Returns
    -------
    population : numpy.ndarray of int, shape (n, c)
        A gene is 1 when its uniform draw from [0, 1) exceeds 0.5, else 0.
    memory : numpy.ndarray of float, shape (2, c)
        Filled with -1. A chromosome made of 0/1 genes can never equal such a
        row, so these rows only give the memory its 2-D shape.
    """
    # `draw > 0.5` yields the same bits as ceil(draw - 0.5) without a Python
    # loop, and keeps the (n, c) shape and int dtype even when n or c is 0.
    population = (np.random.rand(n , c) > 0.5).astype(int)
    memory = np.full((2 , c) , -1.0)
    return population , memory


def single_point_crossover(population):
    """Swap the tails of consecutive pairs of rows at one shared cut point.

    A single cut position is drawn from [1, c) and used for every pair
    (rows 0 and 1, rows 2 and 3, ...). Genes from the cut position onwards
    are exchanged between the two rows of each pair.

    Parameters
    ----------
    population : numpy.ndarray, shape (r, c)
        Modified in place.

    Returns
    -------
    numpy.ndarray
        The same array object that was passed in.
    """
    r , c = population.shape[0] , population.shape[1]

    # A cut must leave at least one gene on each side; with fewer than two
    # genes there is nothing to exchange (and randint(1, c) would raise).
    if c < 2:
        return population

    n = np.random.randint(1 , c)

    # Stop at r - 1 so an odd-sized population keeps its last, unpaired row
    # untouched instead of indexing one row past the end.
    for i in range(0 , r - 1 , 2):
        tail = population[i , n:].copy()
        population[i , n:] = population[i + 1 , n:]
        population[i + 1 , n:] = tail

    return population
    
def flip_mutation(population):
    """Return the complement of a binary population (0 -> 1, 1 -> 0).

    Parameters
    ----------
    population : numpy.ndarray
        Array of 0/1 genes; it is not modified.

    Returns
    -------
    numpy.ndarray
        A new array of the same shape and dtype with every gene inverted.
    """
    # Subtract from the constant 1 rather than from population.max(): the
    # maximum is 0 for an all-zero population (nothing would flip) and is
    # undefined for an empty one.
    return 1 - population


def random_selection(population):
    """Resample the population uniformly at random, with replacement.

    Every slot of the result receives a row drawn independently from the
    input, so a row may appear several times or not at all.

    Parameters
    ----------
    population : numpy.ndarray
        Source rows; left unmodified.

    Returns
    -------
    numpy.ndarray
        A new array with the same shape and dtype as `population`.
    """
    size = population.shape[0]
    # One scalar draw per slot, taken in slot order.
    picks = np.array([np.random.randint(0 , size) for _ in range(size)] , dtype=int)
    # Indexing with an integer array builds a fresh array, not a view.
    return population[picks]

In [3]:
def get_fitness(data , feature_list , target , population):
    """Score every chromosome by the accuracy of a model on its features.

    Parameters
    ----------
    data : pandas.DataFrame
        Holds the feature columns named in `feature_list` and the label
        column `target`.
    feature_list : sequence of str
        Column names; gene j of a chromosome switches `feature_list[j]`.
    target : str
        Label of the column in `data` that holds the class labels.
    population : numpy.ndarray, shape (n, len(feature_list))
        Binary chromosomes; a gene equal to 1 selects its feature.

    Returns
    -------
    list of float
        One score per chromosome, in population order. A chromosome that
        selects no feature scores 0.0.
    """
    # Take the labels from `data` via `target` instead of a notebook global.
    # np.ravel flattens both a Series and a single-column frame to 1-D.
    labels = np.ravel(data[target])

    fitness = []
    for i in range(population.shape[0]):
        columns = [feature_list[j] for j in range(population.shape[1]) if population[i , j] == 1]

        if not columns:
            # A model cannot be fitted on zero features; give the worst score
            # rather than letting the fit raise.
            fitness.append(0.0)
            continue

        fitness.append(predictive_model(data[columns] , labels))

    return fitness


def predictive_model(X , y):
    """Fit a logistic regression on an 80/20 split and return test accuracy.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels aligned with the rows of `X`.

    Returns
    -------
    float
        Accuracy of the fitted model on the held-out 20 % of the rows.
    """
    # Fixed random_state values keep the split and the solver deterministic,
    # so the same inputs always give the same score.
    parts = train_test_split(X , y , test_size=0.2 , random_state=0)
    features_train , features_test , labels_train , labels_test = parts

    classifier = LogisticRegression(solver='liblinear' , max_iter=100 , random_state=7)
    classifier.fit(features_train , labels_train)
    predicted = classifier.predict(features_test)

    return accuracy_score(labels_test , predicted)

In [4]:
def memorize(pop, memory):
    """Return a copy of `memory` with `pop` added as its last row.

    Parameters
    ----------
    pop : numpy.ndarray
        One chromosome holding `memory.shape[1]` elements.
    memory : numpy.ndarray, shape (m, c)
        Previously stored chromosomes; left unmodified.

    Returns
    -------
    numpy.ndarray, shape (m + 1, c)
    """
    width = memory.shape[1]
    # Reshape to the exact memory width so a wrong-sized chromosome fails here.
    as_row = pop.reshape(1 , width)
    return np.concatenate((memory , as_row) , axis=0)

def replace_duplicate(population , memory):
    """Re-draw chromosomes that were already seen and record every row.

    Rows are processed in order. While a row equals some row of `memory` it
    is replaced by a fresh random 0/1 vector, for at most 100 draws; whatever
    the row holds afterwards is appended to the memory, so later rows are
    also compared against earlier rows of the same population.

    Parameters
    ----------
    population : numpy.ndarray, shape (n, c)
        Modified in place.
    memory : numpy.ndarray, shape (m, c)
        Chromosomes seen so far; not modified, a grown copy is returned.

    Returns
    -------
    population : numpy.ndarray
        The same array object that was passed in.
    memory : numpy.ndarray, shape (m + n, c)
    """
    n_rows , n_genes = population.shape[0] , population.shape[1]

    for idx in range(n_rows):
        attempts = 0
        # A memory row is a duplicate when it agrees with the chromosome on
        # every gene.
        while (memory == population[idx]).all(axis=1).any() and attempts < 100:
            # A gene is 1 exactly when its uniform draw exceeds 0.5.
            population[idx] = (np.random.rand(n_genes) > 0.5).astype(int)
            attempts += 1

        memory = memorize(population[idx] , memory)

    return population , memory

In [5]:
def ga(data , feature_list , target , n , max_iter , mutation_rate=0.3):
    """Search for a feature subset with a simple genetic algorithm.

    Each generation resamples the population, applies single-point
    crossover, flips the whole population with probability `mutation_rate`,
    replaces chromosomes that were already evaluated, and scores the result.
    The best chromosome seen over the whole run is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus the label column.
    feature_list : sequence of str
        Names of the candidate feature columns.
    target : str
        Name of the label column in `data`.
    n : int
        Population size.
    max_iter : int
        Number of generations.
    mutation_rate : float, optional
        Probability of applying `flip_mutation` in a generation.

    Returns
    -------
    optimal_solution : numpy.ndarray, shape (len(feature_list),)
        Binary mask; 1 marks a selected feature.
    optimal_value : float
        Fitness of `optimal_solution`.
    """
    c = len(feature_list)

    population , memory = init_population(n , c)
    population , memory = replace_duplicate(population , memory)

    # Work on an array: comparing a plain list with a scalar does not give an
    # element-wise mask unless the scalar happens to be a NumPy type.
    fitness = np.asarray(get_fitness(data , feature_list , target , population))

    # argmax returns the first index of the maximum, the same row that the
    # former where(...)[0] lookup selected.
    best = int(np.argmax(fitness))
    optimal_value = float(fitness[best])
    # Bind the name that is returned below, so a run in which no generation
    # improves on the initial population still has a result.
    optimal_solution = population[best].copy()

    for _ in range(max_iter):
        population = random_selection(population)
        population = single_point_crossover(population)

        if np.random.rand() < mutation_rate:
            population = flip_mutation(population)

        population , memory = replace_duplicate(population , memory)

        fitness = np.asarray(get_fitness(data , feature_list , target , population))
        best = int(np.argmax(fitness))

        # Strict comparison: on a tie the earlier solution is kept.
        if fitness[best] > optimal_value:
            optimal_value = float(fitness[best])
            optimal_solution = population[best].copy()

    return optimal_solution , optimal_value


In [6]:
# Load the dataset bundle once, then keep its feature matrix and label vector
# under short names.
cancer = load_breast_cancer()
X , y = cancer.data , cancer.target

In [7]:
# Show the names of the feature columns provided by the dataset.
cancer.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [8]:
# Show the names of the target classes.
cancer.target_names

array(['malignant', 'benign'], dtype='<U9')

In [9]:
# Pass the names directly: wrapping the array in a list makes pandas build a
# one-level MultiIndex, so every column label would become a 1-tuple.
df = pd.DataFrame(X , columns=cancer.feature_names)

In [10]:
# Add the class labels to the frame as a 'target' column.
df['target'] = y

In [11]:
# Candidate features for the search: every feature name in the dataset.
feature_list = cancer.feature_names

In [12]:
# Name of the label column that was added to df.
target = 'target'

In [13]:
# The search draws from NumPy's global random generator; seed it here so
# re-running this cell reproduces the same feature set and score.
np.random.seed(0)
# Population of 10 chromosomes, evolved for 100 generations.
feature_set , acc_score = ga(df , feature_list , target , 10, 100)

In [14]:
# Names of the features whose gene is switched on in the returned chromosome.
feature_set_names = [name for idx , name in enumerate(feature_list) if feature_set[idx] == 1]

In [15]:
# Show the selected feature names.
feature_set_names

['mean radius',
 'mean texture',
 'mean perimeter',
 'mean area',
 'mean compactness',
 'mean symmetry',
 'mean fractal dimension',
 'texture error',
 'smoothness error',
 'concave points error',
 'symmetry error',
 'worst radius',
 'worst perimeter',
 'worst symmetry']

In [16]:
# Show the accuracy score returned by the search for the selected features.
acc_score

0.9736842105263158