In [None]:
# Mount Google Drive into the Colab VM so that files stored in Drive are
# reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# TODO: Enter the folder name in your Drive where you have saved the unzipped
# assignment folder, e.g. 'cs231n/assignments/assignment1/'
# NOTE(review): the value below starts with '/', so the paths built from it
# further down contain a double slash ('My Drive//evoprompt_cifar') -- confirm
# that this is intended.
FOLDERNAME = '/evoprompt_cifar'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Now that Drive is mounted, add the project folder to sys.path so that
# the Python interpreter of the Colab VM can import python files located
# inside it.
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))


# Make the project folder the working directory (IPython magic; $FOLDERNAME
# is expanded from the Python variable defined above).
%cd /content/drive/My\ Drive/$FOLDERNAME

Mounted at /content/drive
/content/drive/My Drive/cmsc733/evoprompt_cifar


In [None]:
!pip install openai==0.28

import concurrent.futures
import json
import os
import random
import tensorflow as tf
import math
import numpy as np
import openai
# import torch
# from torch.utils.data import DataLoader,Dataset
# from torchvision import datasets, transforms

# Read the OpenAI API key from the OPENAI_API_KEY environment variable so the
# secret never has to be written into (and shared with) the notebook. When the
# variable is unset this falls back to an empty string, which is the
# placeholder value this line used before.
openai.api_key = os.environ.get('OPENAI_API_KEY', '')
# pip install --upgrade openai


Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.28.0


In [None]:
import pickle
from sklearn.model_selection import train_test_split

class EvoPrompting:
    """Evolutionary search in which a chat LLM writes new candidate programs.

    Every member of a population is a tuple ``(code, metrics, fitness_score)``:

    * ``code`` is Python source text that defines a ``main()`` function
      returning ``(accuracy, model_size)``;
    * ``metrics`` is ``{"accuracy": ..., "model_size": ...}``;
    * ``fitness_score`` is the value computed by :meth:`fitness_function`.

    The population is seeded from the ``.py`` files found in ``seed_folder``.
    Each round of :meth:`evolve` builds few-shot prompts from the current
    population, asks the LLM for child programs, executes and scores them, and
    keeps the ``p`` best entries seen so far as the next parents.
    """

    def __init__(self, lm, task, seed_folder, T, m, k, n, p, alpha,
                 n_evaluations, target_model_size, target_accuracy, seed_evaluation=False, evaluation_path=None):
        """Store the search configuration and build the initial population.

        Parameters:
        lm: Identifier of the crossover language model. It is only stored;
            generate_child currently uses a hard-coded model name.
        task: Free-text description of the task. It is only stored.
        seed_folder (str): Folder that contains the seed ``.py`` files.
        T (int): Number of evolutionary rounds.
        m (int): Number of few-shot prompts per round.
        k (int): Number of in-context examples per prompt.
        n (int): Number of children to generate per prompt.
        p (int): Number of survivors to select per generation.
        alpha: Threshold used by filter_and_eval (a child is kept only when
            its average accuracy is strictly below this value).
        n_evaluations (int): Number of times each program is executed.
        target_model_size (float): Factor applied to the smallest parent model
            size to obtain the target written into the prompt.
        target_accuracy (float): Factor applied to the smallest parent
            accuracy to obtain the target written into the prompt.
        seed_evaluation (bool): When True the seeds are executed to obtain
            their metrics; otherwise the metrics are looked up in the JSON
            file given by ``evaluation_path``.
        evaluation_path (str | None): Path of a JSON file with pre-computed
            seed metrics keyed by seed file name. ``None`` or ``""`` means
            that no such file is available.
        """
        self.seed_folder = seed_folder # Folder where the seed codes are located
        self.seed_evaluation = seed_evaluation # Do we have to evaluate the seed codes?
        self.pre_evaluated_seed_metrics = self.load_pre_evaluated_seed_metrics(evaluation_path) # Pre evaluated seed metrics
        self.lm = lm # the crossover LM
        self.task = task # task description (kept for reference only)
        self.temperatures = [0.2, 0.6, 0.8, 1.0] # uniformly sample from these temperatures
        self.T = T # Number of rounds
        self.m = m # number of few-shot prompts per round
        self.n = n # number of samples to generate per prompt
        self.k = k # number of in-context examples per prompt
        self.p = p # number of survivors to select per generation
        self.n_evaluations = n_evaluations # Number of times to run each model
        self.alpha = alpha # threshold applied to the average accuracy in filter_and_eval
        self.global_population = [] # Global historical population

        self.target_model_size = target_model_size # Target model size factor of the few shot prompt
        self.target_accuracy = target_accuracy # Target accuracy factor of the few shot prompt

        # Use the seed architectures as the first parents. They are scored
        # with the same evaluation function as the generated children.
        self.current_population = []
        self.initialize_population()


    def read_seed_files(self, file_path):
        """Return the full text content of the file at ``file_path``."""
        with open(file_path, "r") as file:
            return file.read()


    def load_pre_evaluated_seed_metrics(self, file_path):
        """Load pre-computed seed metrics from a JSON file.

        Parameters:
        file_path (str | None): Path of the JSON file.

        Returns:
        The parsed JSON content, or an empty dict when ``file_path`` is
        ``None`` or empty (no metrics file supplied), so that constructing the
        class does not fail when the seeds are evaluated on the fly.
        """
        if not file_path:
            return {}
        with open(file_path, "r") as file:
            return json.load(file)


    def initialize_population(self):
        """Fill both populations with the seed programs from ``seed_folder``.

        Every ``.py`` file in the folder is read. Its metrics come either from
        eval_t (``seed_evaluation`` is True) or from the pre-evaluated metrics,
        which are indexed by the seed file name. Seeds whose accuracy is 0 are
        reported and skipped.
        """
        # Sorted so that the population order does not depend on the
        # arbitrary order returned by os.listdir.
        seed_files = sorted(f for f in os.listdir(self.seed_folder) if f.endswith('.py'))

        for seed_file in seed_files:
            seed_file_path = os.path.join(self.seed_folder, seed_file)
            seed_code = self.read_seed_files(seed_file_path)

            if self.seed_evaluation:
                accuracy, model_size = self.eval_t(seed_code)
            else:
                # The stored values may be strings, so convert them to float.
                seed_metrics = self.pre_evaluated_seed_metrics[seed_file]
                accuracy = float(seed_metrics["accuracy"])
                model_size = float(seed_metrics["model_size"])

            if accuracy == 0:
                print("ERROR IN SEED")
                continue

            metrics = {
                "accuracy": accuracy,
                "model_size": model_size,
            }
            # Same scoring rule as the one applied to generated children.
            fitness_score = self.fitness_function(model_size, accuracy)
            entry = (seed_code, metrics, fitness_score)
            self.global_population.append(entry)
            self.current_population.append(entry)


    def make_few_shot_prompt(self, in_context_examples):
        """Build the few-shot prompt text from the given parent entries.

        Parameters:
        in_context_examples (list): Population entries
            ``(code, metrics, fitness_score)`` used as in-context examples.

        Returns:
        str: The instruction text, followed by the metrics and code of every
        example, the target accuracy and target model size (smallest parent
        value multiplied by the configured factor) and a final ``Code:`` line.
        """
        min_accuracy = float('inf')
        min_model_size = float('inf')
        # The continuation lines are part of the string literal, so their
        # leading spaces end up in the prompt that is sent to the model.
        prompt = "Given below are a few seeds for architecture to run on Cifar10 dataset. \
        Your job is to generate a new different and improved child architecture based on the seed architectures given.\
        Also keep in mind the dimensions of Cifar10 data while designing layers for the neural net. \
        Use the same function 'train_part34' from 'utils' to train the model the same way as shown in the seed.\
        Return accuracy & no.of parameters. You can experiment with hyperparameters as well. Your response\
        should be only text that can be executed by python interpreter and nothing else. Don't include any text like\
        ``` python etc" # Instruction header of the prompt

        for example in in_context_examples:
            metrics = example[1]
            min_accuracy = min(min_accuracy, metrics['accuracy']) # Smallest accuracy among the parents
            min_model_size = min(min_model_size, metrics['model_size']) # Smallest model size among the parents
            prompt += f'\nMetrics : {example[1]}\n\n'
            prompt += f'\nCode : {example[0]}\n\n'

        target_accuracy = min_accuracy * self.target_accuracy
        target_model_size = min_model_size * self.target_model_size

        prompt += f'\nTarget Accuracy : {target_accuracy}\n\n'
        prompt += f'\ntarget_model_size : {target_model_size}\n\n'
        prompt += f'Code:\n'
        return prompt


    def generate_child (self, prompt):
        """Ask the chat model for one child program.

        The prompt is sent as a single user message with a temperature drawn
        at random from ``self.temperatures``.

        Returns:
        str: The message content of the first returned choice.
        """
        # NOTE(review): the model name is hard-coded; self.lm is not used here.
        child_code = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[{'role':'user','content': f'{prompt}'}],
            temperature=np.random.choice(self.temperatures, size=1, replace=True).item()
        )
        return child_code.choices[0].message['content']


    def eval_t(self, code_segment):
        """Execute a program ``n_evaluations`` times and average its accuracy.

        ``code_segment`` must define ``main()`` returning
        ``(accuracy, model_size)``; the accuracy is multiplied by 100. The runs
        are submitted to a thread pool. A run that raises is reported and
        contributes 0 to the accuracy sum, which is always divided by
        ``n_evaluations``.

        Parameters:
        code_segment (str): Python source text to execute.

        Returns:
        tuple: ``(avg_accuracy, model_size)`` where ``model_size`` is the value
        reported by a successful run, or 0 when no run succeeded. ``(0, 0)``
        is returned when ``n_evaluations`` is not positive.
        """
        # SECURITY: the executed text may have been generated by an LLM and is
        # run without any sandboxing; only use this in a disposable environment.

        def single_evaluation():
            print("Executing code segment")
            try:
                # Run in a private copy of the module namespace: concurrent
                # runs cannot overwrite each other's definitions, and a
                # `main` left over from somewhere else is never scored by
                # mistake when the program fails to define its own.
                namespace = dict(globals())
                namespace.pop("main", None)
                exec(code_segment, namespace)
                accuracy, model_size = namespace["main"]()
                print(f"Finished executing code segment: accuracy={accuracy*100}, model_size={model_size}")
                return accuracy*100, model_size

            except Exception as e:
                print("An error occurred: Moving to the next iteration", e)
                return None, 0

        if self.n_evaluations <= 0:
            # Nothing to run; also avoids a division by zero below.
            return 0, 0

        sum_accuracy = 0
        model_size = 0
        with concurrent.futures.ThreadPoolExecutor() as executor:
            print("Submitting tasks to the thread pool")
            futures = [executor.submit(single_evaluation) for _ in range(self.n_evaluations)]
            for future in concurrent.futures.as_completed(futures):
                run_accuracy, run_model_size = future.result()
                if run_accuracy:
                    sum_accuracy += run_accuracy
                    # Only a successful run may define the reported size; a
                    # failed run that happens to finish last must not reset it.
                    model_size = run_model_size

        avg_accuracy = sum_accuracy / self.n_evaluations
        print(f"Average accuracy: {avg_accuracy}, Model size: {model_size}")
        return avg_accuracy, model_size


    def get_top(self, global_population, num_top=None):
        """
        Returns the top entries from the global_population based on their fitness scores.

        Each entry is a tuple (code, metadata, fitness_score). The entries are sorted by
        fitness score in descending order and the first num_top of them are returned.

        Parameters:
        global_population (list): A list of tuples, where each tuple represents an entry in the global
                                population, containing (code, metadata, fitness_score).
        num_top (int, optional): The number of top entries to return. Defaults to self.p.

        Returns:
        list: A list containing the top num_top entries from the global_population based on their fitness
            scores.
        """
        limit = self.p if num_top is None else num_top
        sorted_population = sorted(global_population, key=lambda x: x[2], reverse=True)
        return sorted_population[:limit]


    def cross_mutation(self):
        """Generate the children of one round.

        Builds ``m`` few-shot prompts, each from up to ``k`` parents sampled
        without replacement from the current population, and requests ``n``
        children per prompt.

        Returns:
        list: The generated child programs (``m * n`` source strings).

        Raises:
        ValueError: If the current population is empty.
        """
        if not self.current_population:
            raise ValueError("Cannot sample parents: the current population is empty.")

        # Never ask for more parents than exist (e.g. when some seeds failed).
        n_parents = min(self.k, len(self.current_population))

        child_architectures = []
        for _ in range(self.m): # create m few-shot prompts
            in_context_examples = random.sample(self.current_population, n_parents)
            prompt = self.make_few_shot_prompt(in_context_examples)
            child_architectures.extend(self.generate_child(prompt) for _ in range(self.n))
        return child_architectures


    def fitness_function(self, model_size, accuracy):
        """Return ``model_size * accuracy``, or 0 when ``model_size`` is falsy."""
        # NOTE(review): get_top sorts this score in descending order, so
        # larger models rank higher -- confirm that this is the intended
        # direction.
        if model_size:
            return model_size * accuracy
        return 0


    def filter_and_eval(self, child_architectures, alpha):
        """Evaluate the children and keep those that pass the filter.

        Parameters:
        child_architectures (list): Source strings of the children.
        alpha: Children are kept only when their average accuracy is strictly
            below this value.

        Returns:
        list: Entries ``(code, metrics, fitness_score)`` of the kept children.
        Children whose average accuracy is 0 are reported as errors.
        """
        CEVALED = []
        for code_segment in child_architectures:
            print("Evaluating child : \n", code_segment)
            avg_accuracy, model_size = self.eval_t(code_segment)
            if avg_accuracy == 0:
                print("Error in child code generated by the GPT")
                continue
            # NOTE(review): this keeps children *below* alpha -- confirm that
            # the comparison direction is intended.
            if avg_accuracy < alpha:
                metrics = {
                    "accuracy": avg_accuracy,
                    "model_size": model_size,
                }
                fitness_score = self.fitness_function(model_size, avg_accuracy)
                CEVALED.append((code_segment, metrics, fitness_score))
        return CEVALED


    def train(self, CEVALED):
        """Placeholder for tuning the language model on evaluated children.

        Not implemented: it does nothing and returns None.
        """
        # The original author of the paper proposes a soft prompt tune method here.
        # That needs a model that can be soft prompt tuned, probably gpt2 on huggingface.
        pass

    def evolve(self):
        """Run ``T`` evolutionary rounds.

        Each round generates children, evaluates and filters them, and adds
        the kept ones to the global population. After every round except the
        last, the current population is replaced by the top entries of the
        global population.

        Returns:
        list: The top entries of the global population after the last round.
        """
        for t in range(self.T): # number of evolutionary rounds
            child_architectures = self.cross_mutation() # Generate the set of code samples
            evaluated_children = self.filter_and_eval(child_architectures, self.alpha)
            self.global_population.extend(evaluated_children)

            if t < self.T - 1:
                # Runs without the training step (see train).
                self.current_population = self.get_top(global_population=self.global_population)

        return self.get_top(global_population=self.global_population)




if __name__ == "__main__":
    # ---- Search configuration ----
    T = 10 # number of evolutionary rounds
    m = 1 # few-shot prompts built per round
    n = 1 # children requested per prompt
    k = 3 # in-context examples (parents) per prompt
    p = 3 # survivors kept per generation
    n_evaluations = 2 # how many times each model is run
    alpha = 96 # TBD (cutoff accuracy for evaluated children)
    task = "create a solution that genreates the best model with the smallest paramter size"
    seed_folder = "/cifar/seeds" # folder that contains all the initial seed architectures
    lm = "gpt-4.0" # language model identifier handed to EvoPrompting

    # Factors handed to EvoPrompting as target_model_size / target_accuracy.
    target_model_factor = 0.90
    target_episodes = 0.95

    # Build the searcher; the seeds are evaluated on the fly (seed_evaluation=True).
    evo_prompt = EvoPrompting(
        lm=lm,
        task=task,
        seed_folder=seed_folder,
        T=T,
        m=m,
        k=k,
        n=n,
        p=p,
        alpha=alpha,
        n_evaluations=n_evaluations,
        target_model_size=target_model_factor,
        target_accuracy=target_episodes,
        seed_evaluation=True,
        evaluation_path="",
    )

    # Run the main evolutionary loop.
    evo_prompt.evolve()

    # Fetch the best entries of the whole history and print their fields
    # (code, metrics, fitness score), one per line.
    top = evo_prompt.get_top(global_population=evo_prompt.global_population)

    print('top\n')
    for code in top:
        for i in code:
            print(i)

Submitting tasks to the thread pool
Executing code segment
Executing code segment
Iteration 0, Epoch 1, Loss: 3.585379123687744, Accuracy: 7.8125, Val Loss: 6.301713943481445, Val Accuracy: 13.09999942779541
Iteration 0, Epoch 1, Loss: 3.530014991760254, Accuracy: 7.8125, Val Loss: 18.751785278320312, Val Accuracy: 8.0
Iteration 700, Epoch 1, Loss: 1.4140808582305908, Accuracy: 50.73332977294922, Val Loss: 1.1524512767791748, Val Accuracy: 60.000003814697266
Iteration 700, Epoch 1, Loss: 1.789637804031372, Accuracy: 38.20880889892578, Val Loss: 1.4623537063598633, Val Accuracy: 47.20000076293945
Iteration 1400, Epoch 2, Loss: 1.0196714401245117, Accuracy: 64.8277587890625, Val Loss: 1.0305410623550415, Val Accuracy: 65.69999694824219
Iteration 1400, Epoch 2, Loss: 1.3861662149429321, Accuracy: 50.28050994873047, Val Loss: 1.3331040143966675, Val Accuracy: 52.39999771118164
Iteration 2100, Epoch 3, Loss: 0.8919765949249268, Accuracy: 69.53811645507812, Val Loss: 1.0307246446609497, Val 