In [None]:
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [None]:
# data load-in: this cell must define X_train and y_train, which the cells below rely on

In [None]:
########################## hyper-parameters ##########################
iterations = 50 # number of iterations
pop_size = 100   # size of population (i.e. how many  chromosomes)
pc = 0.4   # probability of crossing
pm = 0.1   # probability of mutation
columns = list(X_train.columns)   # candidate feature names; gene j switches columns[j] on/off
# One gene per candidate column. Deriving the length from `columns` keeps the
# chromosome in step with the data: a hard-coded length raises IndexError when
# there are fewer columns, and leaves surplus columns always selected when
# there are more.
chrom_length = len(columns)    # length of a chromosome
if chrom_length == 0:
    # Without at least one gene no non-zero chromosome can ever be generated.
    raise ValueError("X_train has no columns to select features from")
######################################################################

In [None]:
# generate the initial population
def geneEncoding():
    """Append `pop_size` random chromosomes to the global `pop`.

    Each chromosome is a list of `chrom_length` genes drawn independently
    with `random.randint(0, 1)`. A draw made up entirely of zeros is thrown
    away and redrawn, so every stored chromosome has at least one 1.
    `pop` is extended, not reset.
    """
    accepted = 0
    while accepted < pop_size:
        candidate = [random.randint(0, 1) for _ in range(chrom_length)]
        if 1 not in candidate:
            # all-zeros chromosome: discard and draw again
            continue
        pop.append(candidate)
        accepted += 1

In [None]:
# calculate fitness of every chromosome
def calFitness():
    """Recompute the global `fitness_list` for the current population.

    `fitness_list` is cleared and then receives one score per chromosome
    `pop[0] .. pop[pop_size - 1]`, in order. Gene j equal to 0 removes
    `columns[j]` from `X_train`; any other value keeps it. The score is the
    mean of `cross_val_score(..., cv=10, scoring='roc_auc')` for a
    `LogisticRegression` fitted on the remaining columns. A chromosome with
    no kept gene scores 0 without fitting anything.

    Identical chromosomes are scored once per call and the result is reused,
    which avoids repeating the 10-fold cross-validation for duplicates.
    """
    fitness_list.clear()
    scored = {}  # gene tuple -> fitness, valid for this call only
    for i in range(pop_size):
        genes = tuple(pop[i][j] for j in range(chrom_length))

        if genes not in scored:
            dropped = [columns[j] for j in range(chrom_length) if genes[j] == 0]

            if len(dropped) == chrom_length:
                scored[genes] = 0     # 0 fitness for all-zeros
            else:
                # drop() returns a new frame, so X_train itself is never modified
                X_sub = X_train.drop(columns=dropped)
                # NOTE(review): newer scikit-learn releases spell the unpenalised
                # model as penalty=None -- confirm against the installed version.
                clf = LogisticRegression(penalty='none') # logistic regression as basic model
                scored[genes] = cross_val_score(clf, X_sub, y_train, cv=10,
                                                scoring='roc_auc').mean() # 10-fold cv AUC as fitness

        fitness_list.append(scored[genes])

In [None]:
# roulette wheel selection (higher fitness, higher probability of being chosen)
def sumFitness():
    """Return the sum of the first `pop_size` entries of the global `fitness_list`."""
    running_total = 0
    # Accumulate strictly left to right, one entry at a time.
    for score in (fitness_list[k] for k in range(pop_size)):
        running_total += score
    return running_total

def getRatio():
    """Rebuild the global `ratio_list` as the running sum of `fitness_list`.

    Entry k holds fitness_list[0] + ... + fitness_list[k] for the first
    `pop_size` entries. The final entry is then overwritten with exactly 1.
    """
    ratio_list.clear()
    cumulative = fitness_list[0]
    ratio_list.append(cumulative)
    for share in (fitness_list[k] for k in range(1, pop_size)):
        cumulative = cumulative + share
        ratio_list.append(cumulative)
    # pin the upper end of the wheel to 1 regardless of the accumulated value
    ratio_list[-1] = 1

def selection():
    """Replace the global `pop` with `pop_size` chromosomes drawn by roulette wheel.

    Steps:
      1. `fitness_list` is normalised in place by `sumFitness()` so that it
         holds selection probabilities. If the total is not positive (for
         example every chromosome scored 0), every chromosome gets the same
         probability instead of dividing by zero.
      2. `getRatio()` turns those probabilities into the cumulative
         `ratio_list`.
      3. `pop_size` sorted uniform draws are walked against `ratio_list`;
         each draw selects the first chromosome whose cumulative value
         exceeds it.

    Every selected chromosome is stored as its own copy, so a chromosome
    picked several times does not share one list object between slots, and
    lists handed out earlier (such as a saved best chromosome) are not part
    of the new population.
    """
    global pop
    total_fitness = sumFitness()
    if total_fitness > 0:
        for i in range(pop_size):
            fitness_list[i] = fitness_list[i] / total_fitness
    else:
        # no usable fitness signal: fall back to a uniform wheel
        for i in range(pop_size):
            fitness_list[i] = 1 / pop_size
    getRatio()

    rand_ratio = sorted(random.random() for _ in range(pop_size))

    new_pop = []
    j = 0  # current slot on the wheel; never moves backwards because draws are sorted
    for draw in rand_ratio:
        # random() < 1 and ratio_list[-1] == 1, so this stops at the last slot at the latest
        while draw >= ratio_list[j]:
            j += 1
        new_pop.append(list(pop[j]))

    pop = new_pop

In [None]:
# crossover
def crossover():
    """Single-point crossover between neighbouring chromosomes of the global `pop`.

    For each adjacent pair (i, i+1), with probability `pc`, a cut point is
    drawn and the two chromosomes exchange their tails from that point on.
    Both slots receive newly built lists. Pairs overlap, so a chromosome
    changed as the second member of one pair can be crossed again as the
    first member of the next.

    The cut point is drawn from 1 .. chrom_length-1. A cut at 0 would only
    swap the two parents without recombining them, so it is excluded. With
    fewer than two genes no interior cut exists and the population is left
    untouched.
    """
    if chrom_length < 2:
        return
    for i in range(pop_size-1): # crossover between adjoin chromosomes
        if random.random() < pc:
            cpoint = random.randint(1, chrom_length-1)    # randomly select an interior crossover point
            first, second = pop[i], pop[i+1]
            pop[i] = first[:cpoint] + second[cpoint:]
            pop[i+1] = second[:cpoint] + first[cpoint:]

In [None]:
# mutation
def mutation():
    """Flip one random gene in each chromosome of the global `pop` with probability `pm`.

    A gene equal to 1 becomes 0; any other value becomes 1. The mutated
    chromosome is written back as a new list instead of being edited in
    place: the same list object can sit in several population slots or be
    held elsewhere (for example as a saved best chromosome), and an in-place
    flip would silently change all of those at once.
    """
    for i in range(pop_size):
        if random.random() < pm:
            mpoint = random.randint(0, chrom_length-1)  # randomly select a mutation point
            mutated = list(pop[i])
            mutated[mpoint] = 0 if mutated[mpoint] == 1 else 1
            pop[i] = mutated

In [None]:
# get the one with highest fitness 
def getBest():
    """Return `(chromosome, fitness)` for the highest entry of `fitness_list`.

    Only the first `pop_size` entries are considered; on ties the earliest
    chromosome wins. The chromosome is returned as a copy, so the caller's
    result stays unchanged when the population is modified afterwards.

    Returns:
        tuple: (list of genes, fitness value) of the best chromosome.
    """
    best_index = 0
    best_fitness = fitness_list[0]
    for i in range(1, pop_size):
        if fitness_list[i] > best_fitness:
            best_fitness = fitness_list[i]
            best_index = i

    return list(pop[best_index]), best_fitness

In [None]:
# Shared state used by the GA helper functions defined in the cells above.
pop = []            # current population: list of chromosomes (lists of 0/1 genes)
fitness_list = []   # one fitness value per chromosome, refreshed by calFitness()
ratio_list = []     # cumulative selection probabilities, refreshed inside selection()

result_GA = []      # per-iteration history: [iteration, best chromosome, best fitness]
GA_best = []        # best chromosome seen over all iterations so far
score_best = 0      # fitness of GA_best

# Seed Python's `random` module so this stochastic run can be repeated;
# change the value (or remove the call) to explore other runs.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

geneEncoding()

for i in range(iterations):
    print(i)

    calFitness()

    best_chrom, best_fitness = getBest()
    # The returned list may still be a member of `pop`, and the genetic
    # operators below can change population members, so record private copies.
    best_chrom = list(best_chrom)
    result_GA.append([i, best_chrom, best_fitness])
    if best_fitness > score_best: # keep the best variable subset along with its fitness
        GA_best = list(best_chrom)
        score_best = best_fitness
    print(i,best_chrom,best_fitness)
    print('best',GA_best,score_best)
    print('\n')

    selection()
    crossover()
    mutation()