# Step 0: Import classes from other files

In [77]:
%load_ext autoreload
%autoreload 2
from constants import *
from evaluator import *
from model import *
from mutator import *
from prompt import *
from task import *

import warnings
warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
import re

def extract_last_numeric_value(input_string):
    """Return the last number appearing in ``input_string`` as a string.

    Thousands separators (commas) are stripped, so ``"1,000"`` becomes
    ``"1000"``. The result is meant to be passed to ``float()``.

    Args:
        input_string: Text to scan (e.g. a reference answer or a model output).

    Returns:
        The last numeric token with commas removed, or ``None`` when the input
        is not a string or contains no digits.
    """
    if not isinstance(input_string, str):
        return None

    # A token must contain at least one digit. The previous pattern
    # (``[\d,.]+``) also matched bare punctuation, so a trailing "." or ","
    # after the real answer was returned instead of the number.
    #   \d[\d,]*(?:\.\d+)?  -> 42   1,000   1,234.50
    #   (?<!\d)\.\d+        -> .75  (leading-dot decimals)
    pattern = r'\d[\d,]*(?:\.\d+)?|(?<!\d)\.\d+'
    matches = re.findall(pattern, input_string)
    if not matches:
        return None

    return matches[-1].replace(",", "")

In [28]:
def evaluate_func(orig, pred):
    """Check whether a prediction's final number matches the reference's.

    Both texts are reduced to their last numeric value via
    ``extract_last_numeric_value`` and compared as floats.

    Args:
        orig: Reference answer text.
        pred: Model output text.

    Returns:
        ``True`` when both values parse as floats and differ by less than
        1e-6; ``False`` otherwise, including when either side has no
        parseable number.
    """
    orig_value = extract_last_numeric_value(orig)
    pred_value = extract_last_numeric_value(pred)
    try:
        return abs(float(orig_value) - float(pred_value)) < 1e-6
    except (TypeError, ValueError):
        # TypeError: no number was found (value is None).
        # ValueError: the extracted token is not a valid float literal.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return False

# Step 1: Initialize the models and the task

In [12]:
from datasets import load_dataset

# Alternative (disabled): full-precision Mistral-7B-Instruct served through the "huggingface" provider.
# mutation_model = Model(provider="huggingface", model_name="mistralai/Mistral-7B-Instruct-v0.1")
# Active choice: the Q4_K_M GGUF build of Mistral-7B-OpenOrca via the "quantized_llama" provider.
mutation_model = Model(provider="quantized_llama", model_name="TheBloke/Mistral-7B-OpenOrca-GGUF", model_file="mistral-7b-openorca.Q4_K_M.gguf", model_type="mistral")
# Evaluation reuses the very same model object (no second model is loaded).
evaluation_model = mutation_model

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [78]:
# Seed values for the first prompt of the population.
system_instruction = "You are genius at mathematical thinking and reasoning."
thinking_style = "Think about this step by step."
# Task bundles the GSM8K 'main' dataset, the base instruction text and the
# scoring callback (evaluate_func compares the last number in each answer).
task = Task(load_dataset('gsm8k', 'main'),
            'Solve the math word problem, giving your answer as an arabic numeral.',
            evaluate_func)
# Unmutated starting prompt: zero examples, gene id 0, no mutation history.
# NOTE(review): task.initial_prompt is presumably the instruction string passed above - confirm in task.py.
initial_prompt = Prompt(task.initial_prompt, system_instruction, thinking_style, num_examples=0, gene_id=0, previous_mutation=None)

In [79]:
# Display the dataset held by the task (bare last expression -> rich repr).
task.dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

In [80]:
# Sample zero-shot prompt: render the initial prompt for the 'question' field
# of one sample returned by task.test_sample(), with no worked examples.
print(initial_prompt._zero_shot_prompt(task.test_sample()['question']))

SYSTEM: You are genius at mathematical thinking and reasoning.
USER: Think about this step by step., Solve the math word problem, giving your answer as an arabic numeral..
QUESTION: Thomas withdraws $1000 in 20 dollar bills from the bank account.  He loses 10 bills while getting home.  After that, he uses half of the remaining bills to pay for a bill.  Thomas then triples his money.  He then converts all his bills to 5 dollar bills.  How many 5 dollar bills does he have?
ANSWER: 



In [81]:
# Sample one-shot prompt: same as above, but the value returned by
# task.train_sample() is passed in as the single worked example.
print(initial_prompt._one_shot_prompt(
    task.test_sample()['question'],
    task.train_sample()))

SYSTEM: You are genius at mathematical thinking and reasoning.
USER: Think about this step by step., Solve the math word problem, giving your answer as an arabic numeral..
QUESTION: Mona plays a multiplayer video game on the weekends. She joins groups with four other players to complete objectives in the game. The past weekend, she joined 9 groups. One of the groups included two players she had grouped with before that weekend, and another group included one person she had grouped with before. How many unique players did Mona group with on the video game that weekend?
ANSWER: Mona joined 9 groups, so she played with 9 * 4 = <<9*4=36>>36 people.
She grouped with 2 + 1 = <<2+1=3>>3 people twice.
Thus, she played with 36 - 3 = <<36-3=33>>33 unique players that weekend.
#### 33
QUESTION: Pierson scored 278 points in one game of bowling. Nikita scored 11 more than half as many as Pierson. How many points did Pierson and Nikita have in total?
ANSWER: 



In [82]:
# Sample few-shot prompt: the result of task.train() supplies the worked examples.
# NOTE(review): presumably task.train() returns the training split and the
# prompt picks its examples from it - confirm in task.py / prompt.py.
print(initial_prompt._few_shot_prompt(
    task.test_sample()['question'],
    task.train()))

SYSTEM: You are genius at mathematical thinking and reasoning.
USER: Think about this step by step., Solve the math word problem, giving your answer as an arabic numeral.. 
QUESTION: Erin is sorting through the library books to decide which ones to replace. She finds 8 less than 6 times as many obsolete books as damaged books. If she removes 69 books total, how many books were damaged?
ANSWER: Let o be the number of obsolete books and d be the number of damaged books. We know that o + d = 69 and o = 6d - 8.
Substituting the first equation into the second equation, we get 6d - 8 + d = 69
Combining like terms, we get 7d - 8 = 69
Adding 8 to both sides, we get 7d = 77
Dividing both sides by 7, we get d = 11
#### 11
QUESTION: Tickets to the school play cost $6 for students and $8 for adults. If 20 students and 12 adults bought tickets, how many dollars' worth of tickets were sold?
ANSWER: The students bought $6*20=$<<6*20=120>>120 worth of tickets
The adults bought $8*12=$<<8*12=96>>96 wor

# Step 2: Initialize a list of initial prompts using the mutator

In [83]:
from datetime import datetime
import os

# Each run writes into its own timestamped directory under results/.
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

folder_path = f"results/{current_time}"
gene_folder_path = f"{folder_path}/genes"
generation_folder_path = f"{folder_path}/generation"

# os.makedirs creates missing parents as well, so this works on a fresh
# checkout where "results/" does not exist yet (os.mkdir raised
# FileNotFoundError there). exist_ok=True keeps the cell safe to re-run.
os.makedirs(gene_folder_path, exist_ok=True)
os.makedirs(generation_folder_path, exist_ok=True)

In [84]:
# --- Evolution hyper-parameters -------------------------------------------
NUM_POPULATION = 5                # prompts kept alive after each generation
NUM_GENERATIONS = 5               # generations run by the main loop
NUM_EVALS = 10                    # passed to evaluator.evaluate() each generation
NUM_EXAMPLES_LIMIT = 4            # passed to PromptMutator
NUM_EXAMPLES_MUTATION_PROB = 0.5  # passed to PromptMutator

# Mutator instructions to sample from. The hand-picked list below is the one
# in use; the previous `original_mutators = MUTATORS` assignment was dead code
# because it was overwritten immediately.
# original_mutators = MUTATORS
original_mutators = [
    "10 times, modify the following instruction in a creative manner.",
    "10 times, change the following instruction to make it more fun, and make sure to think WELL outside the box.",
    # "10 times, as a really good teacher, explain the instruction, as if you were explaining it to a child.",
    # "10 times, imagine you need to follow this instruction. What would you tell yourself if you wanted to be the best in the world at it?",
    # "10 times, how would someone with derailment follow this instruction?",
    # "10 times, rephrase the instruction without using any of the same words. Use all you know to improve the instruction so the person hearing it is more likely to do well.",
    # "10 times, say that instruction again in another way. DON’T use any of the words in the original instruction or you’re fired.",
    # "10 times, develop a prompt mutant by replacing specific keywords with related but unexpected terms. Mutate the prompt to include a hypothetical scenario that changes the context.",
    # "10 times, generate a prompt mutant that introduces an element of suspense or intrigue. Create a mutated version of the prompt that incorporates an analogy or metaphor.",
    # "10 times, develop a prompt mutant by rephrasing the original prompt in a poetic or lyrical style. Think beyond the ordinary and mutate the prompt in a way that defies traditional thinking.",
    # "10 times, break free from conventional constraints and generate a mutator prompt that takes the prompt to uncharted territories. Challenge the norm and create a mutator prompt that pushes the boundaries of traditional interpretations.",
    # "10 times, embrace unconventional ideas and mutate the prompt in a way that surprises and inspires unique variations. Think outside the box and develop a mutator prompt that encourages unconventional approaches and fresh perspectives.",
    # "10 times, embrace the power of unconventional thinking and create a mutator prompt that sparks unconventional mutations and imaginative outcomes. Challenge traditional assumptions and break the mold with a mutator prompt that encourages revolutionary and out-of-the-box variations.",
    # "10 times, encourage reversethinking: Improve the prompt by asking the user to think about the problem in reverse, starting with the solution and working backwards.",
]

# Candidate system instructions ('' means "no system instruction").
original_system_instructions = [
    '',
    # The comma after the next entry was missing, which made Python fuse it
    # with the following string into a single list element.
    "You are genius at mathematical thinking and reasoning.",
    "You are a master storyteller, capable of weaving intricate tales.",
    # "You possess encyclopedic knowledge of world cuisines and cooking techniques.",
    "You are an expert in all known languages, ancient and modern.",
    # "You have the ability to simulate historical figures' perspectives.",
    "You are a genius in solving complex puzzles and riddles.",
    # "You possess a deep understanding of all musical genres and instruments.",
    "You are an AI with advanced skills in astronomy and space science.",
    # "You are a virtual biologist specializing in exotic and extinct species.",
    # "You are a connoisseur of fine arts, familiar with all major art movements.",
    # "You are a computational genius, adept at coding and algorithm design.",
    # "You are an AI trained as a world-class chess strategist.",
    # "You are an expert in global politics and international relations.",
    # "You are a virtual philosopher, well-versed in various philosophical theories.",
    # "You are a seasoned travel guide with knowledge of every corner of the Earth.",
    # "You are a fitness and wellness coach, adept at personalized advice."
]

In [85]:
import random

# One shared mutator drives every mutation in the notebook.
prompt_mutator = PromptMutator(mutation_model, NUM_EXAMPLES_LIMIT, NUM_EXAMPLES_MUTATION_PROB, 1)

# The population starts with the unmutated prompt and is topped up with
# mutated variants of it until it holds NUM_POPULATION members.
initial_pop = [initial_prompt]
while len(initial_pop) < NUM_POPULATION:
    # Draw a mutator instruction, then let the mutator pick its mechanism.
    mutator = random.choice(original_mutators)
    new_prompt = prompt_mutator.random_mutate(initial_prompt, mutator)
    initial_pop.append(new_prompt)

# Working population for the evolution loop (shallow copy, same Prompt objects).
pop = list(initial_pop)

In [86]:
# Print each member of the initial population, one per line, using the
# Prompt object's string form.
for prompt in initial_pop:
    print(prompt)

(Solve the math word  - You are genius at ma - Think about this ste - 0)
(With strength of spi - You are genius at ma - Think about this ste - 0)
("Envision a society  - You are genius at ma - Think about this ste - 0)
(Transform the given  - You are genius at ma - Think about this ste - 0)
("Create a new form o - You are genius at ma - Think about this ste - 2)


# Step 3: Evaluation and mutation loop

In [87]:
import time

# Wall-clock reference; the times printed below are cumulative from here.
start_time = time.time()

params = EvaluatorParams(gene_folder_path)
evaluator = Evaluator(params)

for n_gen in range(NUM_GENERATIONS):
    # Generate NUM_POPULATION new prompts for this generation.
    new_pop = []

    for i in range(NUM_POPULATION):
        # Randomly select a parent prompt and a mutator instruction.
        prompt = random.choice(pop)
        mutator = random.choice(original_mutators)
        # NOTE: a system instruction used to be drawn here as well, but it was
        # never applied to the prompt and only overwrote the global
        # `system_instruction`; the draw was removed until it is wired in.

        # Mutate the selected parent. Previously `initial_prompt` was mutated
        # every time, so the selected parent was ignored and no generation
        # ever built on its predecessors.
        new_prompt = prompt_mutator.random_mutate(prompt, mutator)
        new_pop.append(new_prompt)

    # Evaluate survivors and newcomers together.
    # NOTE(review): presumably evaluate() updates the accuracy / eval counters
    # read below via get_accuracy() / get_num_evals() - confirm in evaluator.py.
    all_pop = pop + new_pop
    scores = evaluator.evaluate(evaluation_model, task, all_pop, NUM_EVALS)

    # Print prompts ranked by accuracy, best first.
    print(f"Generation {n_gen + 1}")
    prompt_scores = [(all_pop[i].get_accuracy(), all_pop[i]) for i in range(len(all_pop))]
    prompt_scores = sorted(prompt_scores, key=lambda x: x[0], reverse=True)
    print("Score  | Evals | Prompt")
    for _, prompt in prompt_scores:
        print(f"{prompt.get_accuracy() * 100:5.2f}% | {prompt.get_num_evals():5d} | {prompt}")

    # Survival of the fittest: keep only the top NUM_POPULATION prompts.
    # TODO: Create a more elaborate selection mechanism that keeps some weaker member
    pop = [prompt_scores[i][1] for i in range(NUM_POPULATION)]

    # Report elapsed time since start_time.
    end_time = time.time()
    execution_time = end_time - start_time
    print("Execution time: {:.2f} seconds".format(execution_time))

Evaluating test samples: [295, 137, 7, 489, 1304, 1082, 52, 217, 181, 461]
Evaluating prompt 0: (Solve the math word  - You are genius at ma - Think about this ste - 0)
Evaluating prompt 1: (With strength of spi - You are genius at ma - Think about this ste - 0)
Evaluating prompt 2: ("Envision a society  - You are genius at ma - Think about this ste - 0)
Evaluating prompt 3: (Transform the given  - You are genius at ma - Think about this ste - 0)
Evaluating prompt 4: ("Create a new form o - You are genius at ma - Think about this ste - 2)
Evaluating prompt 5: (Analyze a data set,  - You are genius at ma - Think about this ste - 4)
Evaluating prompt 6: (Use this reverse thi - You are genius at ma - Think about this ste - 0)
Evaluating prompt 7: (Determine the produc - You are genius at ma - Think about this ste - 0)
Evaluating prompt 8: (Start with the solut - You are genius at ma - Think about this ste - 0)
Evaluating prompt 9: (Resolve the question - You are genius at ma - Think about

In [88]:
# Continue the evolution for generations 6-10. Relies on `pop`, `evaluator`
# and `start_time` left behind by the previous evolution cell.
for n_gen in range(5, 10):
    # Generate NUM_POPULATION new prompts for this generation.
    new_pop = []

    for i in range(NUM_POPULATION):
        # Randomly select a parent prompt and a mutator instruction.
        prompt = random.choice(pop)
        mutator = random.choice(original_mutators)
        # NOTE: a thinking style used to be drawn here from
        # `original_thinking_styles`, a name this notebook never defines, and
        # the result was never used; the draw was removed.

        # Mutate the selected parent. Previously `initial_prompt` was mutated
        # every time, so the selected parent was ignored.
        new_prompt = prompt_mutator.random_mutate(prompt, mutator)
        new_pop.append(new_prompt)

    # Evaluate survivors and newcomers together.
    # NOTE(review): presumably evaluate() updates the accuracy / eval counters
    # read below via get_accuracy() / get_num_evals() - confirm in evaluator.py.
    all_pop = pop + new_pop
    scores = evaluator.evaluate(evaluation_model, task, all_pop, NUM_EVALS)

    # Print prompts ranked by accuracy, best first.
    print(f"Generation {n_gen + 1}")
    prompt_scores = [(all_pop[i].get_accuracy(), all_pop[i]) for i in range(len(all_pop))]
    prompt_scores = sorted(prompt_scores, key=lambda x: x[0], reverse=True)
    print("Score  | Evals | Prompt")
    for _, prompt in prompt_scores:
        print(f"{prompt.get_accuracy() * 100:5.2f}% | {prompt.get_num_evals():5d} | {prompt}")

    # Survival of the fittest: keep only the top NUM_POPULATION prompts.
    # TODO: Create a more elaborate selection mechanism that keeps some weaker member
    pop = [prompt_scores[i][1] for i in range(NUM_POPULATION)]

    # Report elapsed time since start_time (cumulative across cells).
    end_time = time.time()
    execution_time = end_time - start_time
    print("Execution time: {:.2f} seconds".format(execution_time))

Evaluating test samples: [1050, 41, 129, 99, 570, 389, 1143, 481, 737, 993]
Evaluating prompt 0: (Here's an example of - You are genius at ma - Think about this ste - 2)
Evaluating prompt 1: (Break down the infor - You are genius at ma - Think about this ste - 4)
Evaluating prompt 2: (Resolve the question - You are genius at ma - Think about this ste - 3)
Evaluating prompt 3: (Use Arabic numerals  - You are genius at ma - Think about this ste - 3)
Evaluating prompt 4: (Step 2: Solve the ma - You are genius at ma - Think about this ste - 4)
Evaluating prompt 5: (Discover an underwat - You are genius at ma - Think about this ste - 0)
Evaluating prompt 6: (Encourage participan - You are genius at ma - Think about this ste - 0)
Evaluating prompt 7: (Compute the solution - You are genius at ma - Think about this ste - 4)
Evaluating prompt 8: (First, read the math - You are genius at ma - Think about this ste - 0)
Evaluating prompt 9: (Uncover new connecti - You are genius at ma - Think abou

In [None]:
# Continue the evolution for generations 11-15. Relies on `pop`, `evaluator`
# and `start_time` left behind by the earlier evolution cells.
for n_gen in range(10, 15):
    # Generate NUM_POPULATION new prompts for this generation.
    new_pop = []

    for i in range(NUM_POPULATION):
        # Randomly select a parent prompt and a mutator instruction.
        prompt = random.choice(pop)
        mutator = random.choice(original_mutators)
        # NOTE: a thinking style used to be drawn here from
        # `original_thinking_styles`, a name this notebook never defines, and
        # the result was never used; the draw was removed.

        # Mutate the selected parent. Previously `initial_prompt` was mutated
        # every time, so the selected parent was ignored.
        new_prompt = prompt_mutator.random_mutate(prompt, mutator)
        new_pop.append(new_prompt)

    # Evaluate survivors and newcomers together.
    # NOTE(review): presumably evaluate() updates the accuracy / eval counters
    # read below via get_accuracy() / get_num_evals() - confirm in evaluator.py.
    all_pop = pop + new_pop
    scores = evaluator.evaluate(evaluation_model, task, all_pop, NUM_EVALS)

    # Print prompts ranked by accuracy, best first.
    print(f"Generation {n_gen + 1}")
    prompt_scores = [(all_pop[i].get_accuracy(), all_pop[i]) for i in range(len(all_pop))]
    prompt_scores = sorted(prompt_scores, key=lambda x: x[0], reverse=True)
    print("Score  | Evals | Prompt")
    for _, prompt in prompt_scores:
        print(f"{prompt.get_accuracy() * 100:5.2f}% | {prompt.get_num_evals():5d} | {prompt}")

    # Survival of the fittest: keep only the top NUM_POPULATION prompts.
    # TODO: Create a more elaborate selection mechanism that keeps some weaker member
    pop = [prompt_scores[i][1] for i in range(NUM_POPULATION)]

    # Report elapsed time since start_time (cumulative across cells).
    end_time = time.time()
    execution_time = end_time - start_time
    print("Execution time: {:.2f} seconds".format(execution_time))

Evaluating test samples: [637, 102, 1077, 607, 1069, 611, 552, 1038, 185, 1039]
Evaluating prompt 0: (Original prompt: Sol - You are genius at ma - Think about this ste - 3)
Evaluating prompt 1: ("To be the best in t - You are genius at ma - Think about this ste - 3)
Evaluating prompt 2: (Mutator Prompt: "Sol - You are genius at ma - Think about this ste - 4)
Evaluating prompt 3: (Compute the solution - You are genius at ma - Think about this ste - 4)
Evaluating prompt 4: (Use Arabic numerals  - You are genius at ma - Think about this ste - 3)
Evaluating prompt 5: (Ask the user for the - You are genius at ma - Think about this ste - 0)
Evaluating prompt 6: (Now let's look at th - You are genius at ma - Think about this ste - 0)
Evaluating prompt 7: (Present your final a - You are genius at ma - Think about this ste - 1)
Evaluating prompt 8: (Re-envision conventi - You are genius at ma - Think about this ste - 2)
Evaluating prompt 9: (Use this information - You are genius at ma - Think 