# Step 0: Import classes from other files

In [1]:
%load_ext autoreload
%autoreload 2
from constants import *
from evaluator import *
from model import *
from mutator import *
from prompt import *
from task import *

import warnings
warnings.filterwarnings("ignore")

In [2]:
import re

def extract_last_numeric_value(input_string):
    """Return the last number that appears in ``input_string``, as a string.

    Commas are treated as thousands separators and removed, so ``"1,000"``
    becomes ``"1000"``. Trailing periods are stripped and punctuation-only
    matches are ignored, so sentence punctuation that follows the final
    number (``"... 11 dollars."`` or ``"7 apples, then pears"``) does not
    replace the number with ``"."`` or ``""``.

    Args:
        input_string: Text to scan for numbers.

    Returns:
        The last numeric token as a string (it is not converted to a number),
        or ``None`` when the text contains no digits.
    """
    # Runs of digits, commas and periods. This also matches bare punctuation,
    # which is filtered out below.
    pattern = r'[\d,.]+'

    last_numeric_value = None
    for match in re.finditer(pattern, input_string):
        # Drop thousands separators and any sentence-ending period(s).
        candidate = match.group().replace(",", "").rstrip(".")
        # An empty candidate means the match held no digits (e.g. "." or ",").
        if candidate:
            last_numeric_value = candidate

    return last_numeric_value

In [3]:
def evaluate_func(orig, pred):
    """Check whether the reference and predicted texts end in the same number.

    The last numeric value of each text is extracted with
    ``extract_last_numeric_value`` and the two are compared as floats.

    Args:
        orig: Reference answer text.
        pred: Model-generated answer text.

    Returns:
        ``True`` when both texts yield a number and the two differ by less
        than ``1e-6``; ``False`` otherwise, including when either text has no
        parseable number.
    """
    orig_value = extract_last_numeric_value(orig)
    pred_value = extract_last_numeric_value(pred)
    try:
        return abs(float(orig_value) - float(pred_value)) < 1e-6
    except (TypeError, ValueError):
        # TypeError: no number was found, so the value is None.
        # ValueError: the extracted token is not a valid float literal.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return False

# Step 1: Initialize the models and the task

In [4]:
# Alternative backend, kept for reference:
# mutation_model = Model(provider="huggingface", model_name="mistralai/Mistral-7B-Instruct-v0.1")
mutation_model = Model(
    provider="quantized_llama",
    model_name="TheBloke/Mistral-7B-OpenOrca-GGUF",
    model_file="mistral-7b-openorca.Q4_K_M.gguf",
    model_type="mistral",
)
# Evaluation shares the very same model instance as mutation.
evaluation_model = mutation_model


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
from datasets import load_dataset

# Text components of the seed prompt.
system_instruction = "You are genius at mathematical thinking and reasoning."
thinking_style = "Think about this step by step."

# The task bundles the GSM8K dataset, the task instruction and the scoring function.
task = Task(
    load_dataset('gsm8k', 'main'),
    'Solve the math word problem, giving your answer as an arabic numeral.',
    evaluate_func,
)

# Seed prompt that every other member of the initial population is mutated from.
initial_prompt = Prompt(
    task.initial_prompt,
    system_instruction,
    thinking_style,
    num_examples=0,
    gene_id=0,
    previous_mutation=None,
)

In [5]:
# Display the dataset held by the task (splits, feature names and row counts).
task.dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

In [6]:
# Sample zero shot prompt: render the initial prompt around the question of one
# item returned by task.test_sample(), with no worked examples.
# NOTE(review): this calls the underscore-prefixed helper _zero_shot_prompt directly.
print(initial_prompt._zero_shot_prompt(task.test_sample()['question']))

SYSTEM: You are genius at mathematical thinking and reasoning.
USER: Think about this step by step., Solve the math word problem, giving your answer as an arabic numeral.
QUESTION: John decides to take up an odd hobby of speed talking.  His normally speaking speed is 150 WPM.  After training his speed is 2.5 times faster than his starting speed.  How long would it take him to speak 10 pages if each page has 450 words per page?
ANSWER: 



In [7]:
# Sample one shot prompt: same as the zero shot prompt, but the item returned by
# task.train_sample() is passed along as the single worked example.
# NOTE(review): this calls the underscore-prefixed helper _one_shot_prompt directly.
print(initial_prompt._one_shot_prompt(
    task.test_sample()['question'],
    task.train_sample()))

SYSTEM: You are genius at mathematical thinking and reasoning.
USER: Think about this step by step., Solve the math word problem, giving your answer as an arabic numeral.
QUESTION: Michael has $50. He wants to surprise his mom on Mother's day by buying a cake for $20, a bouquet for $36, and a set of balloons for $5. How much more money does Michael need to buy all those?
ANSWER: Michael will spend $20 + $36 + $5 = $<<20+36+5=61>>61.
So, he needs $61 - $50 = $<<61-50=11>>11 more to buy all those for his mom.
#### 11
QUESTION: Finley took part in a 100-meter race.  She started off in first, but then fell back 5 spots.  She then moved ahead 2 spots, before falling behind 3.  Lastly, she jumped ahead 1 spot to finish the race.  What place did she finish in?
ANSWER: 



In [8]:
# Sample few shot prompt: the value of task.train() is passed as the source of
# worked examples (presumably the training split — TODO confirm).
# NOTE(review): this calls the underscore-prefixed helper _few_shot_prompt directly.
print(initial_prompt._few_shot_prompt(
    task.test_sample()['question'],
    task.train()))

SYSTEM: You are genius at mathematical thinking and reasoning.
USER: Think about this step by step., Solve the math word problem, giving your answer as an arabic numeral.. 
QUESTION: Together Felipe and Emilio needed a combined time of 7.5 years to build their homes. Felipe finished in half the time of Emilio. How many months did it take Felipe to build his house?
ANSWER: Let F = the number of years Felipe needed to build his house
Emilio = <<2=2>>2F
3F = 7.5 years
F = <<2.5=2.5>>2.5 years
2.5 years = 30 months
It took Felipe 30 months to build his house.
#### 30
QUESTION: Twice Angie's age, plus 4, is 20. How old is Angie?
ANSWER: Twice Angie's age is 20-4=<<20-4=16>>16.
Angie is 16/2=<<16/2=8>>8.
#### 8
QUESTION: Tracy, John and Jake found their combined weight to be 158 kilograms. If Tracy weighs 52 kg and Jake weighs 8kg more than Tracy, what is the range of their weights?
ANSWER: Jake weighs 8kg more than 52 kg which is 8+52 = <<8+52=60>>60 kg
Tracy and Jake together weigh 60+52 =

# Step 2: Initialize a list of initial prompts using the mutator

In [18]:
from datetime import datetime
import os

# Timestamp that names this run's results folder.
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Folder layout for this run:
#   results/<timestamp>/            run root
#   results/<timestamp>/genes       per-gene data and component ids
#   results/<timestamp>/generation  one CSV per generation
folder_path = f"results/{current_time}"
gene_folder_path = f"{folder_path}/genes"
generation_folder_path = f"{folder_path}/generation"

# os.makedirs creates missing parents (including the top-level "results"
# directory on a fresh checkout, where os.mkdir would raise FileNotFoundError)
# and exist_ok=True makes re-running the cell harmless.
os.makedirs(folder_path, exist_ok=True)
os.makedirs(gene_folder_path, exist_ok=True)
os.makedirs(generation_folder_path, exist_ok=True)

In [19]:
# Hyperparameters of the evolutionary search.
NUM_POPULATION = 5  # size of the initial population and number of survivors kept per generation
NUM_NEW_POPULATION = 5  # mutated prompts generated per generation
NUM_CROSSOVER_PAIRS = 2  # parent pairs crossed over per generation; each pair yields two children
NUM_GENERATIONS = 10  # upper bound of the generation loop, which iterates range(1, NUM_GENERATIONS)
NUM_EVALS = 10  # passed to evaluator.evaluate(); presumably the number of test samples per round — TODO confirm

In [20]:
import random

# Build the initial population: the seed prompt plus NUM_POPULATION - 1
# randomly mutated variants of it.
mutation_model = Model(provider="fireworks", model_name="accounts/fireworks/models/mixtral-8x7b-instruct")
initial_pop = [initial_prompt]
prompt_mutator = PromptMutator(mutation_model)
# Each variant comes from whichever mutation random_mutate picks for the seed prompt.
initial_pop.extend(
    prompt_mutator.random_mutate(initial_prompt)
    for _ in range(1, NUM_POPULATION)
)

In [21]:
# Show the initial population, one prompt per line, using each prompt's string form.
for prompt in initial_pop:
    print(prompt)

(Solve the math word  - You are genius at ma - Think about this ste - 0)
(Swap them out for sy - To find the smallest - Think about this ste - 2)
("Write the response  - You are genius at ma - "Let me ensure I und - 0)
(Who is X and what ar - Implement the soluti - Problem-solving thro - 0)
(Solve this math prob - Focus on understandi - Think about this ste - 0)


# Step 3: Evaluation and mutation loop

In [22]:
# Generation counter; the evolution loop uses it for the "Generation N" label
# and the run_<n_gen>.csv file name, and increments it after every pass.
n_gen = 0
# Start the working population from copies of the initial prompts
# (presumably so initial_pop itself stays untouched — TODO confirm copy() semantics).
pop = [prompt.copy() for prompt in initial_pop]

In [23]:
import time
import pandas as pd

# Reference point for the cumulative "Execution time" printed by the evolution loop.
start_time = time.time()
# Evaluator configured with the genes folder; going by the flag names, text
# completions are stored but not printed.
params = EvaluatorParams(gene_folder_path, print_text_completion=False, store_text_completion=True)
evaluator = Evaluator(params)
# Model used to score prompts. Alternatives are kept commented out; the trailing
# "Done" notes are the author's own markers.
# evaluation_model = mutation_model # Taking too long
evaluation_model = Model(provider="fireworks", model_name="accounts/fireworks/models/mixtral-8x7b-instruct")
# evaluation_model = Model(provider="fireworks", model_name="accounts/fireworks/models/llama-v2-13b-chat") Done
# evaluation_model = Model(provider="fireworks", model_name="accounts/fireworks/models/llama-v2-70b-chat")
# evaluation_model = Model(provider="openai", model_name="gpt-3.5-turbo-instruct") Done

In [24]:
# Evolution loop. Each pass:
#   1. creates children by mutation and crossover of the current population,
#   2. evaluates parents and children together,
#   3. prints and stores the ranking,
#   4. keeps the NUM_POPULATION best prompts as the next population.
# NOTE(review): the "Gen" label uses i + 1 while the "Generation" label and the
# CSV file name use n_gen — confirm which numbering is intended.
for i in range(1, NUM_GENERATIONS):
    print(f"Gen {i + 1}:")
    print("Mutation phase:")

    # Generate new prompts. The inner loops use `_` so they no longer
    # overwrite the generation index `i`.
    new_pop = []
    for _ in range(NUM_NEW_POPULATION):
        # Pick a random parent and let the mutator choose the mutation mechanism.
        prompt = random.choice(pop)
        new_prompt = prompt_mutator.random_mutate(prompt)
        new_pop.append(new_prompt)

    for _ in range(NUM_CROSSOVER_PAIRS):
        # Pick two distinct parents; crossover returns two children.
        prompt1, prompt2 = random.sample(pop, 2)
        new_prompt1, new_prompt2 = prompt_mutator.crossover(prompt1, prompt2)
        new_pop.append(new_prompt1)
        new_pop.append(new_prompt2)

    # Evaluate parents and children together. The ranking below reads accuracy
    # from the prompt objects; `scores` is kept only for interactive inspection.
    all_pop = pop + new_pop
    scores = evaluator.evaluate(evaluation_model, task, all_pop, NUM_EVALS)

    # Print the prompts ranked by accuracy, best first (ties keep their order).
    print(f"Generation {n_gen + 1}")
    prompt_scores = sorted(
        ((candidate.get_accuracy(), candidate) for candidate in all_pop),
        key=lambda pair: pair[0],
        reverse=True,
    )
    print("Score  | Evals | Prompt")
    for _, prompt in prompt_scores:
        print(f"{prompt.get_accuracy() * 100:5.2f}% | {prompt.get_num_evals():5d} | {prompt}")

    # Store the generation data as one CSV per generation.
    data = {
        "Gene": [prompt.gene() for prompt in all_pop],
        "Evaluations": [prompt.get_num_evals() for prompt in all_pop],
        "Accuracy": [prompt.get_accuracy() for prompt in all_pop],
    }
    df = pd.DataFrame(data)
    df.to_csv(f"{generation_folder_path}/run_{n_gen}.csv", index=False)

    # Persist the mutator's component ids next to the gene data.
    prompt_mutator.save_ids(f"{gene_folder_path}/component_ids.pkl")

    # Survival of the fittest: keep the top NUM_POPULATION prompts.
    pop = [prompt for _, prompt in prompt_scores[:NUM_POPULATION]]

    # Report time elapsed since start_time (cumulative, not per generation).
    end_time = time.time()
    execution_time = end_time - start_time
    print("Execution time: {:.2f} seconds".format(execution_time))

    # Increment n_gen
    n_gen += 1

Gen 2:
Mutation phase:
Evaluating test samples: [921, 31, 952, 449, 941, 130, 977, 957, 395, 920]
Evaluating prompt 0: (Solve the math word  - You are genius at ma - Think about this ste - 0)
Evaluating prompt 1: (Swap them out for sy - To find the smallest - Think about this ste - 2)
Evaluating prompt 2: ("Write the response  - You are genius at ma - "Let me ensure I und - 0)
Evaluating prompt 3: (Who is X and what ar - Implement the soluti - Problem-solving thro - 0)
Evaluating prompt 4: (Solve this math prob - Focus on understandi - Think about this ste - 0)
Evaluating prompt 5: (Then, I would guide  - You need to select p - <analytical>         - 3)
Evaluating prompt 6: (As we engage with th - Learn from past fail - Think about this ste - 0)
Evaluating prompt 7: (These prompts now in - - What is the signif - Brainstorming: Gener - 3)
Evaluating prompt 8: (Respond to the math  - You are genius at ma - Subtasks can be comb - 0)
Evaluating prompt 9: ("What if X's impact  - for index, 

InvalidRequestError: {'error': 'Credit limit reached. Please set up (additional) payment at https://app.fireworks.ai/users?tab=billing.'}

In [17]:
# NOTE(review): this cell repeats the evolution loop of the cell above and
# carries an earlier execution count. Running both continues from the current
# `pop` and `n_gen`. Consider keeping a single copy (or a shared function).
#
# Evolution loop. Each pass:
#   1. creates children by mutation and crossover of the current population,
#   2. evaluates parents and children together,
#   3. prints and stores the ranking,
#   4. keeps the NUM_POPULATION best prompts as the next population.
# NOTE(review): the "Gen" label uses i + 1 while the "Generation" label and the
# CSV file name use n_gen — confirm which numbering is intended.
for i in range(1, NUM_GENERATIONS):
    print(f"Gen {i + 1}:")
    print("Mutation phase:")

    # Generate new prompts. The inner loops use `_` so they no longer
    # overwrite the generation index `i`.
    new_pop = []
    for _ in range(NUM_NEW_POPULATION):
        # Pick a random parent and let the mutator choose the mutation mechanism.
        prompt = random.choice(pop)
        new_prompt = prompt_mutator.random_mutate(prompt)
        new_pop.append(new_prompt)

    for _ in range(NUM_CROSSOVER_PAIRS):
        # Pick two distinct parents; crossover returns two children.
        prompt1, prompt2 = random.sample(pop, 2)
        new_prompt1, new_prompt2 = prompt_mutator.crossover(prompt1, prompt2)
        new_pop.append(new_prompt1)
        new_pop.append(new_prompt2)

    # Evaluate parents and children together. The ranking below reads accuracy
    # from the prompt objects; `scores` is kept only for interactive inspection.
    all_pop = pop + new_pop
    scores = evaluator.evaluate(evaluation_model, task, all_pop, NUM_EVALS)

    # Print the prompts ranked by accuracy, best first (ties keep their order).
    print(f"Generation {n_gen + 1}")
    prompt_scores = sorted(
        ((candidate.get_accuracy(), candidate) for candidate in all_pop),
        key=lambda pair: pair[0],
        reverse=True,
    )
    print("Score  | Evals | Prompt")
    for _, prompt in prompt_scores:
        print(f"{prompt.get_accuracy() * 100:5.2f}% | {prompt.get_num_evals():5d} | {prompt}")

    # Store the generation data as one CSV per generation.
    data = {
        "Gene": [prompt.gene() for prompt in all_pop],
        "Evaluations": [prompt.get_num_evals() for prompt in all_pop],
        "Accuracy": [prompt.get_accuracy() for prompt in all_pop],
    }
    df = pd.DataFrame(data)
    df.to_csv(f"{generation_folder_path}/run_{n_gen}.csv", index=False)

    # Persist the mutator's component ids next to the gene data.
    prompt_mutator.save_ids(f"{gene_folder_path}/component_ids.pkl")

    # Survival of the fittest: keep the top NUM_POPULATION prompts.
    pop = [prompt for _, prompt in prompt_scores[:NUM_POPULATION]]

    # Report time elapsed since start_time (cumulative, not per generation).
    end_time = time.time()
    execution_time = end_time - start_time
    print("Execution time: {:.2f} seconds".format(execution_time))

    # Increment n_gen
    n_gen += 1

Gen 2:
Mutation phase:
Evaluating test samples: [1238, 187, 429, 418, 249, 777, 44, 944, 39, 41]
Evaluating prompt 0: ("I just want the num - * Limited customizat - Enthusiastic: You ar - 2)
Evaluating prompt 1: (Give the numerical s - Good luck!           - "Plan a surprise bir - 2)
Evaluating prompt 2: (Solve the math word  - You are genius at ma - Think about this ste - 0)
Evaluating prompt 3: (Give the numerical s - You are genius at ma - Think about this ste - 0)
Evaluating prompt 4: (Write down any assum - You have strong skil - Use analogy: "Accomp - 3)
Evaluating prompt 5: (What are the possibl - For example, it can  - Analogies and metaph - 3)
Evaluating prompt 6: (Original prompt: (1) - You are genius at ma - In order to carry ou - 0)
Evaluating prompt 7: (From a broader persp - Good luck!           - Can you list the req - 1)
Evaluating prompt 8: ("Decipher the coded  - Consider the potenti - "I did not see what  - 0)
Evaluating prompt 9: (Instead of providing - Think about 

In [None]:
# Print the accuracy of every prompt in all_pop (parents and children of the
# most recent generation). The loop variable is deliberately not named `pop`:
# that would overwrite the population list the evolution loop reads and break
# any re-run of it.
for prompt in all_pop:
    print(prompt.get_accuracy())