<a href="https://colab.research.google.com/github/Rocco000/OncoVision/blob/main/Scripts/GAApproach2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Link to Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive') #Mount Google Drive at /content/drive; every path used later in this notebook lives under that folder

Run the required scripts

In [None]:
from google.colab import auth
from googleapiclient.discovery import build

#Authenticate the user who runs the notebook, so that the correct Drive paths can be selected below
auth.authenticate_user()
drive_service = build('drive', 'v3')

#Ask the Drive API for the current user's profile and read the e-mail address from it
about = drive_service.about().get(fields='user').execute()
user_email = about['user']['emailAddress']
#script_owner is read by the following cells to choose between the owner's paths and the "LinkToOncoVision" paths
script_owner = False

if user_email =="rocco.iul2000@gmail.com":
  script_owner = True
  #Owner account: run the helper notebooks stored in "Colab Notebooks"
  %run '/content/drive/MyDrive/Colab Notebooks/DatasetLoader.ipynb'
  %run '/content/drive/MyDrive/Colab Notebooks/ModelArchitecture2.ipynb'
  %run '/content/drive/MyDrive/Colab Notebooks/TrainModel.ipynb'
else:
  #Any other account: run the same three notebooks from the "LinkToOncoVision" folder
  %run '/content/drive/MyDrive/LinkToOncoVision/DatasetLoader.ipynb'
  %run '/content/drive/MyDrive/LinkToOncoVision/ModelArchitecture2.ipynb'
  %run '/content/drive/MyDrive/LinkToOncoVision/TrainModel.ipynb'

# Genetic Algorithm (1° approach)
Each solution represents a cnn architecture (more detail in the paper):
> [ learning_rate,batch_size,num_epoch,optimizer,layer1,layer2,layer3,layer4,layer5,layer6,layer7,layer8,layer9,layer10 ]

Our **objective function**:
> max w * accuracy+(1-w) * recall where w=0.4

Our **evaluation function** is equal to the objective function.

# Fitness function
Our **fitness function** is:
> fitness(x) = f(x)/∑ f(j) where j ∈ P-{x} and P represents the population

We get the fitness value from a csv file

In [None]:
!pip install pygad
import pygad
import csv
import random
import numpy as np
import math

#These parameters are required by PyGAD
def fitness_function_calculator(ga_instance, solution, solution_idx):
  """Return the fitness stored in ActualPopulation.csv for `solution`.

  The CSV is scanned row by row (the header is skipped). Column 0 is parsed as
  a float, columns 1-13 as integers and column 14 as the fitness value. The
  first row whose 14 gene values all equal the genes of `solution` wins.

  Parameters:
    ga_instance: not used here (part of the signature expected by PyGAD).
    solution: indexable sequence holding at least the 14 genes to look up.
    solution_idx: not used here (part of the signature expected by PyGAD).

  Returns:
    The stored fitness as a float, or 0 (after printing a message) when no
    row matches.
  """
  if script_owner:
    population_csv = "/content/drive/MyDrive/SE4AI/Model/EvaluationGASecondApproach/ActualPopulation.csv"
  else:
    population_csv = "/content/drive/MyDrive/LinkToOncoVision/SE4AI/Model/EvaluationGASecondApproach/ActualPopulation.csv"

  with open(population_csv, "r", newline="") as handle:
    rows = csv.reader(handle)
    next(rows) #Skip the header row
    for row in rows:
      #Convert the stored text back to the gene types: learning rate first, then the 13 integer-coded genes
      genes = [float(row[0])]
      genes.extend(int(float(row[column])) for column in range(1, 14))
      stored_fitness = float(row[14])

      #Gene-by-gene comparison that stops at the first mismatch
      if all(gene == solution[position] for position, gene in enumerate(genes)):
        return stored_fitness

  print("Solution not found!")
  return 0

# Initial population
We evaluate the initial population and store their fitness value in a csv file

In [None]:
import torch
def fun_on_start(ga_instance):
  """Train and score every individual of the initial population and persist the results.

  For each individual the batch-size and epoch codes (genes 1 and 2) are
  decoded, the model is trained through start_process and the fitness is
  computed as 0.4*accuracy + 0.6*recall. Whenever an individual beats the best
  fitness seen so far in this call, the model configuration returned by
  start_process is saved to model_parameters.pth. Finally the same table
  (genes, fitness, accuracy, precision, recall, f1) is written to both
  ActualPopulation.csv and AllSolutions.csv, overwriting them.

  Parameters:
    ga_instance: object whose `population` attribute provides `tolist()`.
  """
  if script_owner:
    weights_path = "/content/drive/MyDrive/SE4AI/Model/EvaluationGASecondApproach/model_parameters.pth"
    csv_targets = (
      "/content/drive/MyDrive/SE4AI/Model/EvaluationGASecondApproach/ActualPopulation.csv",
      "/content/drive/MyDrive/SE4AI/Model/EvaluationGASecondApproach/AllSolutions.csv",
    )
  else:
    weights_path = "/content/drive/MyDrive/LinkToOncoVision/SE4AI/Model/EvaluationGASecondApproach/model_parameters.pth"
    csv_targets = (
      "/content/drive/MyDrive/LinkToOncoVision/SE4AI/Model/EvaluationGASecondApproach/ActualPopulation.csv",
      "/content/drive/MyDrive/LinkToOncoVision/SE4AI/Model/EvaluationGASecondApproach/AllSolutions.csv",
    )

  #Gene code -> real hyper-parameter value; unknown codes fall back to the value of code 1
  batch_size_by_code = {1: 32, 2: 64, 3: 128}
  epochs_by_code = {1: 64, 2: 96, 3: 128}

  header = ["learning_rate","batch_size","num_epoch","optimizer","layer1","layer2","layer3","layer4","layer5","layer6","layer7","layer8","layer9","layer10", "fitness_value","accuracy","precision","recall","f1"]

  evaluated_rows = []
  best_so_far = 0
  for individual in ga_instance.population.tolist():
    #Train model: genes from position 4 onwards describe the architecture
    model_state, acc, pre, rec, f1 = start_process(
      model_type=2,
      architecture=individual[4:],
      bool_mlflow=False,
      learning_rate=individual[0],
      batch_size=batch_size_by_code.get(individual[1], 32),
      num_epoch=epochs_by_code.get(individual[2], 64),
      opt=individual[3],
    )
    score = (0.4*acc)+(0.6*rec)
    evaluated_rows.append([*individual, score, acc, pre, rec, f1])

    #Store the model configuration of the best solution
    if score > best_so_far:
      torch.save(model_state, weights_path)
      best_so_far = score

  #Store the solutions evaluations (same content in both files)
  for target in csv_targets:
    with open(target, "w", newline="") as csvfile:
      table = csv.writer(csvfile)
      table.writerow(header)
      table.writerows(evaluated_rows)

# Tournament selection
At each tournament, we will select K=30 solutions

We will run the tournament 20 times to obtain the M=20 parents that take part in the crossover step.

In [None]:
def my_tournament_selection(fitness_values,required_number,ga_instance):
  """Pick `required_number` parents through repeated tournaments of (up to) 3 individuals.

  At every round distinct individuals are drawn at random and the one with the
  highest fitness wins. A winner is then masked with -inf so that it cannot win
  a later round while other candidates are available.

  Parameters:
    fitness_values: sequence/array with one numeric fitness per individual.
      It is NOT modified: the masking is done on a private copy.
    required_number: number of parents to select.
    ga_instance: object whose `population` attribute provides `tolist()`.

  Returns:
    (parents, indices): two numpy arrays, the selected individuals and their
    positions inside the population.
  """
  population_list = ga_instance.population.tolist()

  #Private float copy: writing -inf into the caller's array would corrupt the fitness data it still uses
  remaining_fitness = np.array(fitness_values, dtype=float)

  #A tournament draws 3 distinct individuals; shrink it when the population is smaller than that
  tournament_size = min(3, len(population_list))

  winners = []
  winners_index = []
  for _ in range(required_number):
    #replace=False -> the same individual is not drawn twice in one tournament
    contenders = np.random.choice(len(population_list), size=tournament_size, replace=False)

    #Take the winner's position straight from the drawn indices. Searching the population
    #for an equal individual would return the wrong slot when duplicates exist.
    winner_index = int(contenders[np.argmax(remaining_fitness[contenders])])

    winners.append(population_list[winner_index])
    winners_index.append(winner_index)

    #Mask the winner so that it cannot win the next tournaments
    remaining_fitness[winner_index] = -math.inf

  #PyGAD expects numpy arrays
  return np.array(winners), np.array(winners_index)

# Assessment of solutions
**After the mutation step** we will evaluate the solutions and store their fitness value in a csv file.

If we obtain a solution that has the same configuration as another solution already stored in "AllSolutions.csv", we do not retrain the model, because there is a high probability that the two solutions have the same performance (in this way we reduce the time consumption).

In [None]:
def _load_known_solutions(csv_path):
  """Read AllSolutions.csv once: return (rows indexed by their 14 genes, best stored fitness or 0)."""
  known = {}
  best_fitness = 0
  with open(csv_path, "r", newline="") as csvfile:
    reader = csv.reader(csvfile)
    next(reader) #Jump the first row (contains the file header)
    for row in reader:
      if not row:
        continue #Ignore blank lines
      #Convert the values in the correct format: learning rate, 13 integer-coded genes, 5 metrics
      genes = [float(row[0])]
      genes.extend(int(float(row[column])) for column in range(1, 14))
      metrics = [float(row[column]) for column in range(14, 19)] #fitness, accuracy, precision, recall, f1
      #Keep the first occurrence of a configuration, like a top-to-bottom scan of the file would
      known.setdefault(tuple(genes), genes + metrics)
      if metrics[0] > best_fitness:
        best_fitness = metrics[0]
  return known, best_fitness


def fun_on_generation(ga_instance):
  """Evaluate the current population after a generation step and persist the results.

  Every individual is first looked up among the configurations already stored
  in AllSolutions.csv (or already trained earlier in this same call); a hit
  reuses the stored metrics instead of retraining. Otherwise the model is
  trained through start_process and the fitness is computed as
  0.4*accuracy + 0.6*recall.

  model_parameters.pth is overwritten only when a newly trained model beats
  every fitness recorded so far, so the file is not replaced by a worse model
  of a later generation.

  Files written: new rows are appended to AllSolutions.csv, ActualPopulation.csv
  is rewritten with the current population, and Checkpoint.csv is rewritten
  with the number of completed generations.

  Parameters:
    ga_instance: object providing `population` (with `tolist()`) and
      `generations_completed`.
  """
  print("***************************************************************")
  print("We are at the ",ga_instance.generations_completed," generation step")
  population_list = ga_instance.population.tolist()

  if script_owner:
    file_path = "/content/drive/MyDrive/SE4AI/Model/EvaluationGASecondApproach/AllSolutions.csv"
    path_parameters = "/content/drive/MyDrive/SE4AI/Model/EvaluationGASecondApproach/model_parameters.pth"
    file_path2 = "/content/drive/MyDrive/SE4AI/Model/EvaluationGASecondApproach/ActualPopulation.csv"
    file_path3 = "/content/drive/MyDrive/SE4AI/Model/EvaluationGASecondApproach/Checkpoint.csv"
  else:
    file_path = "/content/drive/MyDrive/LinkToOncoVision/SE4AI/Model/EvaluationGASecondApproach/AllSolutions.csv"
    path_parameters = "/content/drive/MyDrive/LinkToOncoVision/SE4AI/Model/EvaluationGASecondApproach/model_parameters.pth"
    file_path2 = "/content/drive/MyDrive/LinkToOncoVision/SE4AI/Model/EvaluationGASecondApproach/ActualPopulation.csv"
    file_path3 = "/content/drive/MyDrive/LinkToOncoVision/SE4AI/Model/EvaluationGASecondApproach/Checkpoint.csv"

  #Parse the archive of evaluated solutions once instead of once per individual
  known_solutions, max_fitness = _load_known_solutions(file_path)

  #Gene code -> real hyper-parameter value; unknown codes fall back to the value of code 1
  batch_size_by_code = {1: 32, 2: 64, 3: 128}
  epochs_by_code = {1: 64, 2: 96, 3: 128}

  to_write = [] #To store the solutions of the actual population
  to_add = [] #To store the new solutions in AllSolutions.csv

  for solution in population_list:
    genes_key = tuple(solution[:14])
    already_known = known_solutions.get(genes_key)
    if already_known is not None:
      #We have already a same solution, therefore we only store the configuration in ActualPopulation.csv
      to_write.append(already_known)
      continue

    #It's a new solution, therefore we must define and train a model
    size = batch_size_by_code.get(solution[1], 32)
    epoch = epochs_by_code.get(solution[2], 64)

    #Take the model architecture from the solution
    layers = solution[4:]

    #Train model
    best_model_configuration, acc, pre, rec, f1 = start_process(model_type=2, architecture=layers, bool_mlflow=False, learning_rate=solution[0], batch_size=size, num_epoch=epoch, opt=solution[3])
    fitness_value = (0.4*acc)+(0.6*rec)

    app = [*solution, fitness_value, acc, pre, rec, f1]

    #In this case we store the solution both in the AllSolutions and in the ActualPopulation (it's a new solution)
    to_write.append(app)
    to_add.append(app)

    #Remember it, so that an identical individual later in this population is not trained again
    known_solutions[genes_key] = app

    #Store the model configuration only when it beats every solution evaluated so far
    if fitness_value > max_fitness:
      torch.save(best_model_configuration, path_parameters)
      max_fitness = fitness_value

  if len(to_add)>0:
    with open(file_path, "a", newline="") as csvfile:
      writer = csv.writer(csvfile)
      writer.writerows(to_add)

  with open(file_path2, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["learning_rate","batch_size","num_epoch","optimizer","layer1","layer2","layer3","layer4","layer5","layer6","layer7","layer8","layer9","layer10", "fitness_value","accuracy","precision","recall","f1"])
    writer.writerows(to_write)

  #Checkpoint: lets a later run know how many generation steps were already completed
  with open(file_path3, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["num_generation_step"])
    writer.writerow([ga_instance.generations_completed])


# Genetic Algorithm

In [None]:
import os
import matplotlib.pyplot as plt

#Folder that holds every file of this experiment; it depends on who runs the notebook
if script_owner:
  evaluation_dir = "/content/drive/MyDrive/SE4AI/Model/EvaluationGASecondApproach/"
else:
  evaluation_dir = "/content/drive/MyDrive/LinkToOncoVision/SE4AI/Model/EvaluationGASecondApproach/"

#Define dataset
initialize_dataset(evaluation_dir)

checkpoint_path = evaluation_dir + "Checkpoint.csv"

max_num_generation = 2 #100
fitness = fitness_function_calculator
M = 2 #20 #number of parents to selection step
population_size = 3 #100
chromosomes_size = 14 #learning rate, batch size, num epoch, optimizer, 10 layers
# batch size: 1=32, 2=64, 3=128 ;
# num epoch: 1=64, 2=96, 3=128
# optimizer: 1=Adam, 2=Adadelta, 3=Nadam
# layer genes:
  # 1=Conv2d and out_channels = 32,
  # 2=Conv2d and out_channels = 16,
  # 3=Conv2d and out_channels = 8,
  # 4=Conv2d and out_channels = 4,
  # 5=MaxPool2d and kernel = 3,
  # 6=MaxPool2d and kernel = 2,
  # 7=AvgPool2d and kernel = 3,
  # 8=AvgPool2d and kernel = 2,
  # 9=Dropout2d,
  # 10=ReLu,
  # 11=LeakyReLu

#One range per gene: learning rate, then batch size / num epoch / optimizer codes, then the 10 layer codes
genes_range = [{'low': 0.001, 'high': 0.1}]
genes_range += [{'low': 1, 'high': 3} for _ in range(3)]
genes_range += [{'low': 1, 'high': 11} for _ in range(10)]

k = 3 #30 #number of individuals that participate in the tournament (not referenced below in this cell)
ga_instance = None

if os.path.exists(checkpoint_path):
  print("Started the GA from the last saved population")
  #This means that Google Colab stopped the Genetic Algorithm, therefore we restart the GA with an initial population equal to the last saved population
  evolution_step = 0
  with open(checkpoint_path, "r", newline="") as csvfile:
    reader = csv.reader(csvfile)
    next(reader) #To jump the file header
    for row in reader:
      if row:
        #csv returns text: convert it before using it in the subtraction below
        evolution_step = int(row[0])

  #Only the generation steps that are still missing have to be executed
  max_num_generation = max_num_generation - evolution_step

  population_path = evaluation_dir + "ActualPopulation.csv"

  initial_population = []
  with open(population_path, "r", newline="") as csvfile:
    reader = csv.reader(csvfile)
    next(reader) #To jump the file header
    for row in reader:
      #Only the first 14 columns are genes: learning rate (float) followed by 13 integer-coded genes
      genes = [float(row[0])]
      genes.extend(int(float(row[column])) for column in range(1, 14))
      initial_population.append(genes)

  initial_population = np.array(initial_population)
  #The saved population is reused as it is, so no on_start evaluation is registered
  population_settings = {"initial_population": initial_population}
else:
  #Fresh run: PyGAD creates the population and fun_on_start evaluates it
  population_settings = {"sol_per_pop": population_size, "on_start": fun_on_start}

ga_instance = pygad.GA(num_generations = max_num_generation,
                       num_parents_mating = M,
                       fitness_func = fitness,
                       num_genes = chromosomes_size,
                       gene_type =[float, int, int, int, int, int, int, int, int, int, int, int, int, int],
                       gene_space = genes_range,
                       parent_selection_type = my_tournament_selection,
                       crossover_type = "single_point", #One-Point Crossover
                       mutation_type = "random", #Random Resetting -> set a random value within the range
                       mutation_by_replacement = True, #replace the gene by the new randomly generated value
                       mutation_probability = 0.5,
                       stop_criteria= "saturate_40", #Stop criteria: stop the GA if there isn't an improvement after 40 consecutive steps
                       on_generation = fun_on_generation,
                       save_solutions=True,
                       **population_settings
                       )

print("***************************************************************")
print("Started Genetich Algorithm!")
ga_instance.run()
print("GA finished!")
print("***************************************************************")

#Save the best solution
solution, solution_fitness, solution_idx = ga_instance.best_solution()
print("Parameters of the best solution: ", solution)
print("Fitness value of the best solution = ", solution_fitness)

file_path = evaluation_dir + "BestSolutionGA2.csv"

to_write = [*solution, solution_fitness]
with open(file_path, "w", newline="") as csvfile:
  writer = csv.writer(csvfile)
  writer.writerow(["learning_rate","batch_size","num_epoch","optimizer","layer1","layer2","layer3","layer4","layer5","layer6","layer7","layer8","layer9","layer10", "fitness_value"])
  writer.writerow(to_write)

#Plot the fitness values, genes, and the explored solutions
ga_instance.plot_fitness(save_dir=evaluation_dir + 'PlotFitnessGA2Approach.png')
ga_instance.plot_genes(save_dir=evaluation_dir + 'PlotGenesGA2Approach.png')
ga_instance.plot_new_solution_rate(save_dir=evaluation_dir + 'PlotExploredSolutionsGA2Approach.png')