<a href="https://colab.research.google.com/github/Rocco000/OncoVision/blob/main/Scripts/GAapproach1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Link to Google Drive

In [1]:
# Mount Google Drive into the Colab filesystem. Every file path used later in
# this notebook is rooted at /content/drive, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive') #Connect to Google Drive

Mounted at /content/drive


Run the required scripts

In [None]:
from google.colab import auth
from googleapiclient.discovery import build

#Authenticate the user running the notebook so that the matching Drive layout can be selected
auth.authenticate_user()
drive_service = build('drive', 'v3')

#Ask the Drive API which account is authenticated
about = drive_service.about().get(fields='user').execute()
user_email = about['user']['emailAddress']
#Flag read by the later cells of this notebook to choose between the two sets of file paths
script_owner = False

#NOTE(review): the owner's e-mail address is hard-coded here - consider moving it to a configuration value
if user_email =="rocco.iul2000@gmail.com":
  script_owner = True
  #Owner layout: the helper notebooks are stored in "Colab Notebooks"
  #(presumably they define start_process(), which later cells call - confirm)
  %run '/content/drive/MyDrive/Colab Notebooks/DatasetLoader.ipynb'
  %run '/content/drive/MyDrive/Colab Notebooks/ModelArchitecture1.ipynb'
  %run '/content/drive/MyDrive/Colab Notebooks/TrainModel.ipynb'
else:
  #Any other user: the same helper notebooks are reached through the "LinkToOncoVision" folder
  %run '/content/drive/MyDrive/LinkToOncoVision/DatasetLoader.ipynb'
  %run '/content/drive/MyDrive/LinkToOncoVision/ModelArchitecture1.ipynb'
  %run '/content/drive/MyDrive/LinkToOncoVision/TrainModel.ipynb'

# Genetic Algorithm (1° approach)
Each solution has the following configuration (more details in the code below):
> [ learning_rate, batch_size, num_epoch, optimizer_type ]

Our **objective function**:
> max w * accuracy+(1-w) * recall where w=0.4

We convert our problem into a minimization problem, therefore:
> min -w * accuracy-(1-w) * recall

Our **evaluation function** is equal to the objective function.

# Fitness function
Our **fitness function** is:
> fitness(x) = f(x)/∑ f(j) where j ∈ P-{x} and P represents the population

We get the fitness value from a csv file

In [None]:
!pip install pygad
import pygad
import csv
import random
import numpy as np
import math
import pandas as pd

#This parameters are required by PyGAD
def fitness_function_calculator(ga_instance, solution, solution_idx):
  """Look up the fitness already computed for `solution` in ActualPopulation.csv.

  The three parameters form the signature PyGAD requires for a fitness
  function; only `solution` is actually used here.

  Args:
    ga_instance: the running GA instance (unused).
    solution: chromosome whose first four genes are compared against the
      first four columns of every CSV row.
    solution_idx: position of the solution in the population (unused).

  Returns:
    The value stored in the fifth column of the first matching row, or 0
    (after printing a message) when no row matches.
  """
  if script_owner:
    population_csv = "/content/drive/MyDrive/SE4AI/ActualPopulation.csv"
  else:
    population_csv = "/content/drive/MyDrive/LinkToOncoVision/SE4AI/ActualPopulation.csv"

  with open(population_csv, "r", newline="") as handle:
    records = csv.reader(handle)
    next(records) #Skip the header line
    for raw in records:
      #Convert the whole record first, so a malformed row raises before any comparison
      parsed = [float(raw[0]), int(raw[1]), int(raw[2]), int(raw[3]), float(raw[4])]

      #The first row whose four genes equal the solution's genes wins
      if all(parsed[gene] == solution[gene] for gene in range(4)):
        return parsed[4]

  print("Row not found!")
  return 0

# Initial population
We evaluate the initial population and store their fitness value in a csv file

In [4]:
def fun_on_start(ga_instance):
  """Evaluate the initial population and persist the results.

  Every chromosome is decoded (gene codes -> real batch size / epoch count),
  a model is trained through start_process(), and the resulting row is written
  to both ActualPopulation.csv and AllSolutions.csv, overwriting any previous
  content of the two files.

  Args:
    ga_instance: the GA instance whose `population` is evaluated.
  """
  header = ["learning_rate","batch_size","num_epoch","optimizer","fitness_value","accuracy","precision","recall","f1"]

  evaluated = []
  for genes in ga_instance.population.tolist():
    batch_code = genes[1]
    epoch_code = genes[2]

    #batch_size: code 2 -> 64, code 3 -> 128, everything else (code 1 included) -> 32
    size = 64 if batch_code == 2 else 128 if batch_code == 3 else 32

    #num_epoch: code 2 -> 96, code 3 -> 128, everything else (code 1 included) -> 64
    epoch = 96 if epoch_code == 2 else 128 if epoch_code == 3 else 64

    #Train the model and combine accuracy and recall into the fitness value
    acc, pre, rec, f1 = start_process(bool_mlflow=False, learning_rate=genes[0], batch_size=size, num_epoch=epoch, opt=genes[3])
    fitness_value = (0.4*acc)+(0.6*rec)
    evaluated.append([genes[0],genes[1],genes[2],genes[3],fitness_value,acc,pre,rec,f1])

  #Choose the Drive layout that belongs to the current user
  if script_owner:
    destinations = ("/content/drive/MyDrive/SE4AI/ActualPopulation.csv",
                    "/content/drive/MyDrive/SE4AI/AllSolutions.csv")
  else:
    destinations = ("/content/drive/MyDrive/LinkToOncoVision/SE4AI/ActualPopulation.csv",
                    "/content/drive/MyDrive/LinkToOncoVision/SE4AI/AllSolutions.csv")

  #Both files start with the same content: header + one row per individual
  for destination in destinations:
    with open(destination, "w", newline="") as csvfile:
      out = csv.writer(csvfile)
      out.writerow(header)
      out.writerows(evaluated)

# Tournament selection
At each tournament, we will select K=30 solutions

We will run the tournament 20 times to obtain the M=20 parents that will take part in the crossover step.

In [5]:
def my_tournament_selection(fitness_values,required_number,ga_instance):
  """Pick `required_number` distinct parents by repeated tournaments.

  In each round a handful of not-yet-selected individuals is drawn at random
  (without repetition) and the one with the highest fitness wins. A winner is
  removed from the pool, so it cannot be selected twice.

  Args:
    fitness_values: fitness of every individual, indexed like the population.
      It is only read; the caller's sequence is never modified.
    required_number: how many parents to return.
    ga_instance: object exposing `population` (2-D array of chromosomes).
      If it has a `K_tournament` attribute, that value is used as the
      tournament size; otherwise the size is 3.

  Returns:
    (parents, parent_indices): the selected chromosomes and their row indices
    in the population, both as NumPy arrays (the format PyGAD requires).

  Raises:
    ValueError: if more parents are requested than there are individuals.
  """
  population = np.asarray(ga_instance.population)
  population_size = len(population)
  if required_number > population_size:
    raise ValueError("Cannot select %d distinct parents from a population of %d individuals"
                     % (required_number, population_size))

  #Work on a private copy: the caller keeps using its own fitness record after selection
  fitness = np.array(fitness_values, dtype=float)

  #Tournament size: configurable through the GA instance, 3 by default
  tournament_size = getattr(ga_instance, "K_tournament", None) or 3

  remaining = np.arange(population_size) #indices that can still win a tournament
  winners_index = []
  for _ in range(required_number):
    #Never ask for more contestants than individuals left in the pool
    contestants = np.random.choice(remaining, size=min(tournament_size, remaining.size), replace=False)

    #The winner is identified by its population index, so duplicated chromosomes are not confused
    winner = int(contestants[np.argmax(fitness[contestants])])
    winners_index.append(winner)

    #Remove the winner from the pool so it cannot win another tournament
    remaining = remaining[remaining != winner]

  winners_index_numpy = np.array(winners_index, dtype=int)
  #Index the population directly so that the gene types of the chromosomes are preserved
  winners_numpy = population[winners_index_numpy]

  return winners_numpy, winners_index_numpy

# Assessment of solutions
**After the mutation step** we will evaluate the solutions and store their fitness values in a csv file.

If we obtain a solution that has the same configuration of another solution stored in "AllSolutions.csv", we do not retrain the model as there is a high probability that the solutions have the same performance (in this way we reduce the time consumption)

In [6]:
def fun_on_generation(ga_instance):
  """Evaluate the current population, reusing every result already on disk.

  AllSolutions.csv acts as a cache of every chromosome evaluated so far. It is
  read once; a chromosome found there (or already trained earlier in this same
  call) is not trained again. New chromosomes are trained through
  start_process(), appended to AllSolutions.csv, and the whole population is
  written to ActualPopulation.csv, replacing its previous content.

  Args:
    ga_instance: the GA instance whose `population` is evaluated.
  """
  if script_owner:
    file_path = "/content/drive/MyDrive/SE4AI/AllSolutions.csv"
    file_path2 = "/content/drive/MyDrive/SE4AI/ActualPopulation.csv"
  else:
    file_path = "/content/drive/MyDrive/LinkToOncoVision/SE4AI/AllSolutions.csv"
    file_path2 = "/content/drive/MyDrive/LinkToOncoVision/SE4AI/ActualPopulation.csv"

  #Load the cache once: (learning_rate, batch code, epoch code, optimizer code) -> full CSV row
  known = {}
  with open(file_path, "r", newline="") as csvfile:
    reader = csv.reader(csvfile)
    next(reader) #Jump the first row (contains the file header)
    for row in reader:
      if not row:
        continue #Ignore blank lines
      key = (float(row[0]), int(row[1]), int(row[2]), int(row[3]))
      known.setdefault(key, row) #Keep the first occurrence of a configuration

  to_write = [] #To store the solutions of the actual population
  to_add = [] #To store the new solutions in AllSolutions.csv

  for solution in ga_instance.population.tolist():
    key = (solution[0], solution[1], solution[2], solution[3])

    cached = known.get(key)
    if cached is not None:
      #Same configuration already evaluated: reuse the complete row (all nine columns)
      to_write.append(cached)
      continue

    #It's a new solution, therefore we must define and train a model

    #batch_size: code 2 -> 64, code 3 -> 128, everything else (code 1 included) -> 32
    size = 64 if solution[1] == 2 else 128 if solution[1] == 3 else 32

    #num_epoch: code 2 -> 96, code 3 -> 128, everything else (code 1 included) -> 64
    epoch = 96 if solution[2] == 2 else 128 if solution[2] == 3 else 64

    acc, pre, rec, f1 = start_process(bool_mlflow=False, learning_rate=solution[0], batch_size=size, num_epoch=epoch, opt=solution[3])
    fitness_value = (0.4*acc)+(0.6*rec)
    app = [solution[0],solution[1],solution[2],solution[3],fitness_value,acc,pre,rec,f1]

    #A new solution goes both in AllSolutions and in ActualPopulation
    to_write.append(app)
    to_add.append(app)
    #Remember it, so an identical chromosome later in this generation is not trained again
    known[key] = app

  with open(file_path, "a", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(to_add)

  with open(file_path2, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["learning_rate","batch_size","num_epoch","optimizer","fitness_value","accuracy","precision","recall","f1"])
    writer.writerows(to_write)


# Genetic Algorithm

In [None]:
#Define and train our model once without the GA, to have a baseline of its performance
print("Train our model without the GA (to check its performance)")
start_process(bool_mlflow=True, learning_rate=0.001, batch_size=64, num_epoch=64, opt=1)


max_num_generation = 3
fitness = fitness_function_calculator
M = 2 #20 #number of parents to selection step
fitness_func = fitness_function_calculator
population_size = 3 #100
chromosomes_size = 4 #learning rate, batch size, num epoch, optimizer
# batch size: 1=32, 2=64, 3=128 ;
# num epoch: 1=64, 2=96, 3=128
# optimizer: 1=Adam, 2=Adadelta, 3=Nadam

#The learning rate is sampled from a continuous range. The three categorical genes list their
#admissible codes explicitly: a {'low': 1, 'high': 3} range excludes the upper bound, so with
#an int gene type the code 3 would practically never be generated.
genes_range = [{'low': 0.001, 'high': 0.1}, [1, 2, 3], [1, 2, 3], [1, 2, 3]]

k = 3 #30 #number of individuals that take part in each tournament


ga_instance = pygad.GA(num_generations = max_num_generation,
                       num_parents_mating = M,
                       fitness_func = fitness,
                       sol_per_pop = population_size,
                       num_genes = chromosomes_size,
                       gene_type =[float, int, int, int],
                       gene_space = genes_range,
                       parent_selection_type = my_tournament_selection,
                       K_tournament = k, #Stored on the GA instance so the tournament size is configured in one place
                       crossover_type = "single_point", #One-Point Crossover
                       mutation_type = "random", #Random Resetting -> set a random value within the range
                       mutation_by_replacement = True, #replace the gene by the new randomly generated value
                       mutation_probability = 0.5, #Probability with which each gene is mutated
                       stop_criteria= "saturate_40", #Stop the GA if the best fitness does not improve for 40 consecutive generations (cannot trigger while max_num_generation < 40)
                       on_start = fun_on_start,
                       on_generation = fun_on_generation
                       )
ga_instance.run()

solution, solution_fitness, solution_idx = ga_instance.best_solution()
print("Parameters of the best solution: ", solution)
print("Fitness value of the best solution = ", solution_fitness)

#NOTE(review): the owner path contains a "Model" folder that the other path does not - confirm this is intended
file_path = ""
if script_owner:
  file_path = "/content/drive/MyDrive/SE4AI/Model/EvaluationGAFirstApproach/BestSolutionGA1.csv"
else:
  file_path = "/content/drive/MyDrive/LinkToOncoVision/SE4AI/EvaluationGAFirstApproach/BestSolutionGA1.csv"

#Store the best chromosome followed by its fitness value
to_write = []
to_write.extend(solution)
to_write.append(solution_fitness)
with open(file_path, "w", newline="") as csvfile:
  writer = csv.writer(csvfile)
  writer.writerow(["learning_rate","batch_size","num_epoch","optimizer","fitness_value"])
  writer.writerow(to_write)