In [117]:
import pandas
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.metrics import mean_squared_error

In [118]:
# Genetic algorithm taken from: https://towardsdatascience.com/hyperparameter-tuning-in-xgboost-using-genetic-algorithm-17bd2e581b17


In [129]:
def initilialize_poplulation(numberOfParents):
    learningRate = np.empty([numberOfParents, 1])
    nEstimators = np.empty([numberOfParents, 1], dtype = np.uint8)
    maxDepth = np.empty([numberOfParents, 1], dtype = np.uint8)
    minChildWeight = np.empty([numberOfParents, 1])
    gammaValue = np.empty([numberOfParents, 1])
    subSample = np.empty([numberOfParents, 1])
    colSampleByTree =  np.empty([numberOfParents, 1])
    for i in range(numberOfParents):
        print(i)
        learningRate[i] = round(random.uniform(0.001, 1), 2)
        nEstimators[i] = random.randrange(10, 1500, step = 25)
        maxDepth[i] = int(random.randrange(1, 10, step= 1))
        minChildWeight[i] = round(random.uniform(0.01, 10.0), 2)
        gammaValue[i] = round(random.uniform(0.01, 10.0), 2)
        subSample[i] = round(random.uniform(0.01, 1.0), 2)
        colSampleByTree[i] = round(random.uniform(0.01, 1.0), 2)
    
    population = np.concatenate((learningRate, nEstimators, maxDepth, minChildWeight, gammaValue, subSample, colSampleByTree), axis= 1)
    return population

In [152]:
def fitness(y_true, y_pred):
    fitness = round((1/mean_squared_error(y_true, y_pred)), 7)
    return fitness  # train the data and find fitness score; I modified fitness to 1/MSE -- therefore we still want the highest "fitness"

def train_population(population, TrainX, TrainY, TestX, TestY):
    fScore = []
    for i in range(population.shape[0]):
        param = { 'objective':'reg:squarederror',
              'learning_rate': population[i][0],
              'n_estimators': population[i][1], 
              'max_depth': int(population[i][2]), 
              'min_child_weight': population[i][3],
              'gamma': population[i][4], 
              'subsample': population[i][5],
              'colsample_bytree': population[i][6],
              'seed': 24} # Maybe add alpha at some point... 
        num_round = 100
        
        # Put data into dmatrix format
        xgDMatrix = xgb.DMatrix(TrainX, TrainY) #create Dmatrix
        xgbDMatrixTest = xgb.DMatrix(TestX, TestY)
        dMatrixtest = xgbDMatrixTest
        dMatrixTrain = xgDMatrix
        # This is where we need to put the regressor bit
        #xg_reg = xgb.XGBRegressor(**param, num_boost_round = num_round)
        #xg_reg.fit(TrainX, TrainY)
        #preds = xg_reg.predict(TestX)
        
        xgbT = xgb.train(param, dMatrixTrain, num_round)
        preds = xgbT.predict(dMatrixtest)
        #preds = preds>0.5 # I have no idea why this is here; it creates an array of "True" the size of preds; however preds is needed to calculate the fitness score
        #print('test y ', TestY, 'preds ', preds)
        fScore.append(fitness(TestY, preds))
        # I think maybe they're clearing preds, which shouldn't be needed... but I tried it down here
        preds = preds>0.5
    return fScore

In [121]:
# select parents for mating
def new_parents_selection(population, fitness, numParents):
    selectedParents = np.empty((numParents, population.shape[1])) #create an array to store fittest parents
    
    # find the top best performing parents
    for parentId in range(numParents):
        bestFitnessId = np.where(fitness == np.max(fitness)) 
        bestFitnessId  = bestFitnessId[0][0]
        selectedParents[parentId, :] = population[bestFitnessId, :]
        fitness[bestFitnessId] = -1  # set this value to negative, in case of F1-score, so this parent is not selected again; changed to nan
    return selectedParents

In [122]:
'''
Mate these parents to create children having parameters from these parents (we are using uniform crossover method)
'''

def crossover_uniform(parents, childrenSize):
    
    crossoverPointIndex = np.arange(0, np.uint8(childrenSize[1]), 1, dtype= np.uint8) #get all the index
    crossoverPointIndex1 = np.random.randint(0, np.uint8(childrenSize[1]), np.uint8(childrenSize[1]/2)) # select half  of the indexes randomly
    crossoverPointIndex2 = np.array(list(set(crossoverPointIndex) - set(crossoverPointIndex1))) #select leftover indexes
    
    children = np.empty(childrenSize)
    
    '''
    Create child by choosing parameters from two parents selected using new_parent_selection function. The parameter values
    will be picked from the indexes, which were randomly selected above. 
    '''
    for i in range(childrenSize[0]):
        
        # find parent 1 index 
        parent1_index = i%parents.shape[0]
        # find parent 2 index
        parent2_index = (i+1)%parents.shape[0]
        # insert parameters based on random selected indexes in parent 1
        children[i, crossoverPointIndex1] = parents[parent1_index, crossoverPointIndex1]
        # insert parameters based on random selected indexes in parent 1
        children[i, crossoverPointIndex2] = parents[parent2_index, crossoverPointIndex2]
    return children

In [153]:
def mutation(crossover, numberOfParameters):
    
    # Define minimum and maximum values allowed for each parameter
    minMaxValue = np.zeros((numberOfParameters, 2))
    
    minMaxValue[0,:] = [0.01, 1.0] #min/max learning rate
    minMaxValue[1, :] = [10, 2000] #min/max n_estimator
    minMaxValue[2, :] = [1, 15] #min/max depth
    minMaxValue[3, :] = [0, 10.0] #min/max child_weight
    minMaxValue[4, :] = [0.01, 10.0] #min/max gamma
    minMaxValue[5, :] = [0.01, 1.0] #min/maxsubsample
    minMaxValue[6, :] = [0.01, 1.0] #min/maxcolsample_bytree
 
    # Mutation changes a single gene in each offspring randomly.
    mutationValue = 0
    parameterSelect = np.random.randint(0, 7, 1)
    print(parameterSelect)
    if parameterSelect == 0: #learning_rate
        mutationValue = round(np.random.uniform(-0.5, 0.5), 2)
    if parameterSelect == 1: #n_estimators
        mutationValue = np.random.randint(-200, 200, 1)
    if parameterSelect == 2: #max_depth
        mutationValue = np.random.randint(-5, 5, 1)
    if parameterSelect == 3: #min_child_weight
        mutationValue = round(np.random.uniform(5, 5), 2)
    if parameterSelect == 4: #gamma
        mutationValue = round(np.random.uniform(-2, 2), 2)
    if parameterSelect == 5: #subsample
        mutationValue = round(np.random.uniform(-0.5, 0.5), 2)
    if parameterSelect == 6: #colsample
        mutationValue = round(np.random.uniform(-0.5, 0.5), 2)
  
    # indtroduce mutation by changing one parameter, and set to max or min if it goes out of range
    for idx in range(crossover.shape[0]):
        crossover[idx, parameterSelect] = crossover[idx, parameterSelect] + mutationValue
        if(crossover[idx, parameterSelect] > minMaxValue[parameterSelect, 1]):
            crossover[idx, parameterSelect] = minMaxValue[parameterSelect, 1]
        if(crossover[idx, parameterSelect] < minMaxValue[parameterSelect, 0]):
            crossover[idx, parameterSelect] = minMaxValue[parameterSelect, 0] 
    return crossover

In [154]:
acceptable_trials = ['m07_t01_15', 'm07_t03_15', 'm07_t06_15','m10_t02_16','m11_t02_16','m11_t04_16',
                   'm12_t02_16','m14_t05_16', 'm14_t03_16', 'm15_t01_16', 'm15_t03_16']


In [127]:
# Create an empty dataframe to store the optimal hyper parameters determined by the genetic algorithm

HypParams= pandas.DataFrame({'Trial' : acceptable_trials})  
a = np.full((len(HypParams)), np.nan)
column_names = ['learning_rate', 'n_estimators', 'max_depth', 'min_child_weight', 'gamma', 'subsample','colsample_bytree']
for col in column_names: 
    HypParams[col] = a

HypParams


Unnamed: 0,Trial,learning_rate,n_estimators,max_depth,min_child_weight,gamma,subsample,colsample_bytree
0,m07_t01_15,,,,,,,
1,m07_t03_15,,,,,,,
2,m07_t06_15,,,,,,,
3,m10_t02_16,,,,,,,
4,m11_t02_16,,,,,,,
5,m11_t04_16,,,,,,,
6,m12_t02_16,,,,,,,
7,m14_t05_16,,,,,,,
8,m14_t03_16,,,,,,,
9,m15_t01_16,,,,,,,


In [163]:
numberOfParents = 10 # number of parents to start
numberOfParentsMating = 4 # number of parents that will mate
numberOfParameters = 7 # number of parameters that will be optimized
numberOfGenerations = 7 # number of genration that will be created#define the population sizepopulationSize = (numberOfParents, numberOfParameters)#initialize the population with randomly generated parameters

for trial_ind in np.arange(0,len(acceptable_trials)):
    
    TrainX = pandas.read_csv('./MLFormattedData/Train/' + acceptable_trials[trial_ind] + '_TrainX.csv' )
    TrainY = pandas.read_csv('./MLFormattedData/Train/' + acceptable_trials[trial_ind] + '_TrainY.csv' )

    TestX = pandas.read_csv('./MLFormattedData/Test/' + acceptable_trials[trial_ind] + '_TestX.csv' )
    TestY = pandas.read_csv('./MLFormattedData/Test/' + acceptable_trials[trial_ind] + '_TestY.csv' )
    
    # For whatever reason, if we redefine this between trials, I think it will work fine. But if not, we get a list not callable error...?
    def fitness(y_true, y_pred):
        fitness = round((1/mean_squared_error(y_true, y_pred)), 7)
        return fitness  # train the data and find fitness score; I modified fitness to 1/MSE -- therefore we still want the highest "fitness"


    # define the population size
    populationSize = (numberOfParents, numberOfParameters)

    # initialize the population with randomly generated parameters
    population = initilialize_poplulation(numberOfParents)#define an array to store the fitness  hitory
    fitnessHistory = np.empty([numberOfGenerations+1, numberOfParents])#define an array to store the value of each parameter for each parent and generation
    populationHistory = np.empty([(numberOfGenerations+1)*numberOfParents, numberOfParameters])#insert the value of initial parameters in history
    populationHistory[0:numberOfParents, :] = population 

    for generation in range(numberOfGenerations):
        print("This is number %s generation" % (generation))

        #train the dataset and obtain fitness
        fitnessValue = train_population(population=population, TrainX = TrainX, TrainY = TrainY, TestX = TestX, TestY = TestY)

        fitnessHistory[generation, :] = fitnessValue

        #best score in the current iteration
        print('Best F1 score in the this iteration = {}'.format(np.max(fitnessHistory[generation, :])))#survival of the fittest - take the top parents, based on the fitness value and number of parents needed to be selected
        parents = new_parents_selection(population=population, fitness=fitnessValue, numParents=numberOfParentsMating)

        #mate these parents to create children having parameters from these parents (we are using uniform crossover)
        children = crossover_uniform(parents=parents, childrenSize=(populationSize[0] - parents.shape[0], numberOfParameters))

        #add mutation to create genetic diversity
        children_mutated = mutation(children, numberOfParameters)

        '''
        We will create new population, which will contain parents that where selected previously based on the
        fitness score and rest of them  will be children
        '''
        population[0:parents.shape[0], :] = parents #fittest parents
        population[parents.shape[0]:, :] = children_mutated #children

        populationHistory[(generation+1)*numberOfParents : (generation+1)*numberOfParents+ numberOfParents , :] = population #srore parent information
        
        
    #Best solution from the final iteration
    fitness = train_population(population=population, TrainX = TrainX, TrainY = TrainY, TestX = TestX, TestY =TestY)
    fitnessHistory[generation+1, :] = fitness # index of the best solution
    bestFitnessIndex = np.where(fitness == np.max(fitness))[0][0]#Best fitness
    print("Best fitness is =", fitness[bestFitnessIndex])#Best parameters
    print("Best parameters are:")
    print('learning_rate', population[bestFitnessIndex][0])
    print('n_estimators', population[bestFitnessIndex][1])
    print('max_depth', int(population[bestFitnessIndex][2])) 
    print('min_child_weight', population[bestFitnessIndex][3])
    print('gamma', population[bestFitnessIndex][4])
    print('subsample', population[bestFitnessIndex][5])
    print('colsample_bytree', population[bestFitnessIndex][6])
    
    for param_ind in np.arange(0,7):
        if param_ind == 2: 
            HypParams.at[trial_ind, column_names[param_ind]] = int(population[bestFitnessIndex][param_ind])
        else:
            HypParams.at[trial_ind, column_names[param_ind]] = population[bestFitnessIndex][param_ind]

0
1
2
3
4
5
6
7
This is number 0 generation
Best F1 score in the this iteration = 3.4479174
[1]
This is number 1 generation
Best F1 score in the this iteration = 3.4479174
[2]
This is number 2 generation
Best F1 score in the this iteration = 3.4479174
[0]
This is number 3 generation
Best F1 score in the this iteration = 3.4479174
[2]
This is number 4 generation
Best F1 score in the this iteration = 3.4479174
[6]
This is number 5 generation
Best F1 score in the this iteration = 3.4479174
[2]
Best fitness is = 3.6294539
Best parameters are:
learning_rate 0.18
n_estimators 173.0
max_depth 4
min_child_weight 9.24
gamma 5.66
subsample 0.53
colsample_bytree 0.93
0
1
2
3
4
5
6
7
This is number 0 generation
Best F1 score in the this iteration = 0.8943575
[1]
This is number 1 generation
Best F1 score in the this iteration = 0.8943575
[5]
This is number 2 generation
Best F1 score in the this iteration = 0.909312
[4]
This is number 3 generation
Best F1 score in the this iteration = 0.9261362
[6]


In [164]:
HypParams.to_csv('GeneticAlgorithmParameters_Extrapolation.csv')

In [165]:
HypParams

Unnamed: 0,Trial,learning_rate,n_estimators,max_depth,min_child_weight,gamma,subsample,colsample_bytree
0,m07_t01_15,0.18,173.0,4.0,9.24,5.66,0.53,0.93
1,m07_t03_15,0.57,85.0,9.0,0.57,6.91,0.95,0.68
2,m07_t06_15,0.18,235.0,1.0,4.56,10.0,0.48,0.6
3,m10_t02_16,0.09,129.0,6.0,4.42,3.88,0.66,0.56
4,m11_t02_16,0.13,320.0,7.0,0.71,5.61,0.18,0.52
5,m11_t04_16,0.04,196.0,8.0,10.0,5.99,0.99,0.83
6,m12_t02_16,0.08,180.0,2.0,3.07,4.51,0.49,0.03
7,m14_t05_16,0.04,142.0,8.0,5.51,0.98,0.73,0.88
8,m14_t03_16,0.42,123.0,12.0,4.87,8.09,0.41,0.42
9,m15_t01_16,0.05,61.0,6.0,2.14,7.84,0.44,0.53
