# Step 0: Import classes from other files

In [1]:
%load_ext autoreload
%autoreload 2
from constants import *
from evaluator import *
from model import *
from mutator import *
from prompt import *
from task import *

import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation/runtime warnings from the local
# modules imported above — narrow the filter when debugging.
warnings.filterwarnings("ignore")

In [2]:
import re

def extract_last_numeric_value(input_string):
    """Return the last number found in ``input_string`` as a plain string.

    A number is a run of digits that may contain thousands separators
    (``1,234``) and an optional decimal part (``3.50``), or a bare decimal
    such as ``.5``. Commas are stripped from the returned text, so
    ``"1,234.50"`` comes back as ``"1234.50"``.

    Every match must contain at least one digit. Stand-alone punctuation,
    such as the period that ends a sentence, is therefore never mistaken for
    a number; previously "... 80 ounces." yielded "." instead of "80".

    Args:
        input_string: Text to scan, e.g. a reference answer or a model
            completion.

    Returns:
        The last numeric token with commas removed, or ``None`` when the
        input is not a string or contains no digits.
    """
    if not isinstance(input_string, str):
        return None

    # digits with optional commas and optional fraction | bare ".5"-style
    # decimal that is not the tail of a longer number
    pattern = r'\d[\d,]*(?:\.\d+)?|(?<!\d)\.\d+'

    matches = re.findall(pattern, input_string)
    if not matches:
        return None

    # Only the final number matters (e.g. the value after "####").
    return matches[-1].replace(",", "")

In [3]:
def evaluate_func(orig, pred):
    """Decide whether a prediction's final number matches the reference's.

    The last numeric value of each text is extracted with
    ``extract_last_numeric_value`` and the two are compared as floats with an
    absolute tolerance of 1e-6.

    Args:
        orig: Reference answer text.
        pred: Predicted answer text.

    Returns:
        ``True`` when both texts yield a number and the numbers agree within
        the tolerance; ``False`` when either side has no parseable number.
    """
    try:
        # Extraction is inside the try so a non-string input (e.g. a missing
        # completion) is scored as wrong instead of raising.
        orig_value = extract_last_numeric_value(orig)
        pred_value = extract_last_numeric_value(pred)
        return abs(float(orig_value) - float(pred_value)) < 1e-6
    except (TypeError, ValueError):
        # TypeError: no number found (None) or non-string input.
        # ValueError: the extracted token is not a valid float.
        # Anything else (KeyboardInterrupt, real bugs) must propagate.
        return False

# Step 1: Initialize the models and the task

In [4]:
from datasets import load_dataset

# Model used to mutate prompts: a quantized Mistral-7B OpenOrca GGUF build.
# Hugging Face alternative, kept for reference:
# mutation_model = Model(provider="huggingface", model_name="mistralai/Mistral-7B-Instruct-v0.1")
mutation_model = Model(
    provider="quantized_llama",
    model_name="TheBloke/Mistral-7B-OpenOrca-GGUF",
    model_file="mistral-7b-openorca.Q4_K_M.gguf",
    model_type="mistral",
)
# Evaluate with the same model for now; the evolution-loop cell reassigns this.
evaluation_model = mutation_model


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# Persona and reasoning cue used by the hand-written starting prompt.
system_instruction = "You are genius at mathematical thinking and reasoning."
thinking_style = "Think about this step by step."

# GSM8K ('main' configuration) paired with the task instruction and the
# numeric-match scorer defined above.
task = Task(
    load_dataset('gsm8k', 'main'),
    'Solve the math word problem, giving your answer as an arabic numeral.',
    evaluate_func,
)

# Generation-zero prompt: no examples, gene id 0, no mutation history.
initial_prompt = Prompt(
    task.initial_prompt,
    system_instruction,
    thinking_style,
    num_examples=0,
    gene_id=0,
    previous_mutation=None,
)

In [6]:
# Bare expression: displays the dataset held by the task (splits and row counts).
task.dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

In [7]:
# Preview the zero-shot rendering of the initial prompt for one question
# obtained from task.test_sample().
print(
    initial_prompt._zero_shot_prompt(
        task.test_sample()['question']
    )
)

SYSTEM: You are genius at mathematical thinking and reasoning.
USER: Think about this step by step., Solve the math word problem, giving your answer as an arabic numeral.
QUESTION: Misha picks out 4 blouses from the 30% off rack.  The regular price for each blouse is $20.  How much is the total cost of the discounted blouses?
ANSWER: 



In [8]:
# Preview the one-shot rendering: a question from task.test_sample() preceded
# by a single worked example from task.train_sample().
question_text = task.test_sample()['question']
worked_example = task.train_sample()
print(initial_prompt._one_shot_prompt(question_text, worked_example))

SYSTEM: You are genius at mathematical thinking and reasoning.
USER: Think about this step by step., Solve the math word problem, giving your answer as an arabic numeral.
QUESTION: Tim has some cans of soda. Jeff comes by, and takes 6 cans of soda from Tim. Tim then goes and buys another half the amount of soda cans he had left. If Tim has 24 cans of soda in the end, how many cans of soda did Tim have at first?
ANSWER: Let x be the number of cans Tim has at first.
After Jeff comes by, Tim has x-6 cans left.
Tim buys another (x-6)/2 cans.
x-6+(x-6)/2=24
2*x-12+x-6=48
3*x-18=48
3*x=66
x=<<22=22>>22
#### 22
QUESTION: Two girls each got 1/6 of the 24 liters of water. Then a boy got 6 liters of water. How many liters of water were left?
ANSWER: 



In [9]:
# Preview the few-shot rendering: a question from task.test_sample() plus the
# examples supplied by task.train().
question_text = task.test_sample()['question']
example_pool = task.train()
print(initial_prompt._few_shot_prompt(question_text, example_pool))

SYSTEM: You are genius at mathematical thinking and reasoning.
USER: Think about this step by step., Solve the math word problem, giving your answer as an arabic numeral.. 
QUESTION: A pipe is clogged so that only 2 ounces of cleaner can run through the pipe per minute. After fifteen minutes, the cleaner has unclogged it enough that 3 ounces can run through per minute. Ten minutes later, the clog is cleared enough for 4 ounces to run through per minute. How many ounces of cleaner were used after 30 minutes?
ANSWER: For 15 minutes at 2 ounces per minute, 2 * 15 = <<2*15=30>>30 ounces were used.
For the next 10 minutes at 3 ounces per minute, 3 * 10 = <<3*10=30>>30 ounces were used.
There are 30 - 15 - 10 = <<30-15-10=5>>5 minutes remaining.
For the next 5 minutes at 4 ounces per minute, 4 * 5 = <<4*5=20>>20 ounces were used.
After 30 minutes, 30 + 30 + 20 = <<30+30+20=80>>80 ounces of cleaner were used.
#### 80
QUESTION: A train travels 270 miles in 3 hours. At the same rate, how many a

# Step 2: Initialize a list of initial prompts using the mutator

In [30]:
from datetime import datetime
import os

# Timestamp that makes every run write into its own results folder.
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Layout:
#   results/<timestamp>/            root of this run
#   results/<timestamp>/genes       handed to EvaluatorParams by the evolution loop
#   results/<timestamp>/generation  one CSV summary per generation
folder_path = f"results/{current_time}"
gene_folder_path = f"{folder_path}/genes"
generation_folder_path = f"{folder_path}/generation"

# makedirs creates missing parents (including a not-yet-existing "results/")
# and exist_ok=True makes re-running the cell harmless; plain os.mkdir raised
# FileNotFoundError on a fresh checkout.
for run_directory in (folder_path, gene_folder_path, generation_folder_path):
    os.makedirs(run_directory, exist_ok=True)

In [31]:
# Evolution hyper-parameters.
# Size of the initial population and number of survivors kept per generation.
NUM_POPULATION = 10
# Prompts created by random mutation in each generation.
NUM_NEW_POPULATION = 10
# Parent pairs crossed over per generation; each pair yields two children.
NUM_CROSSOVER_PAIRS = 5
# Iterations of the mutate/evaluate/select loop.
NUM_GENERATIONS = 10
# Passed to evaluator.evaluate() on every call.
# NOTE(review): presumably the number of questions scored per prompt — confirm in evaluator.py.
NUM_EVALS = 10

In [33]:
import random

# Build the starting population: the hand-written prompt first, then
# NUM_POPULATION - 1 variants, each produced by a randomly chosen mutation
# of that same prompt.
prompt_mutator = PromptMutator(mutation_model)
initial_pop = [initial_prompt]
initial_pop.extend(
    prompt_mutator.random_mutate(initial_prompt)
    for _ in range(NUM_POPULATION - 1)
)

In [34]:
# List every member of the starting population, one per line.
for member in initial_pop:
    print(member)

(Solve the math word  - You are genius at ma - Think about this ste - 0)
(Now we're ready for  - Use the question to  - Think about this ste - 1)
(For example, turning - Re-read and redefine - What are the underly - 2)
(The third prompt hig - Analyze the data and - Does the problem inv - 0)
(Step 2: Convert the  - Use your knowledge a - Creative Thinking: E - 1)
(Picture a situation  - Celebrate small vict - In this revised chal - 2)
(Summarize the main p - Analyze the problem  - "Remembering the suc - 0)
("Break down the prob - Practice mathematica - Challenging assumpti - 0)
(How many more apples - Analyze the context  - Promote collaboratio - 1)
(You have 20 apples a - You are genius at ma - Systemizing (logic b - 3)


# Step 3: Evaluation and mutation loop

In [35]:
# Generation counter; the evolution loop increments it after each generation.
n_gen = 0
# Working population starts as copies of the initial prompts.
pop = [member.copy() for member in initial_pop]

In [None]:
import time
import pandas as pd

# Evolution loop. Each generation:
#   1. creates NUM_NEW_POPULATION mutants and 2 * NUM_CROSSOVER_PAIRS crossover children,
#   2. evaluates parents and children together,
#   3. prints a ranking and writes a CSV summary,
#   4. keeps the NUM_POPULATION most accurate prompts as the next parents.
# Reads `pop`, `n_gen`, `prompt_mutator`, `task` and the folder paths from earlier cells.

start_time = time.time()
params = EvaluatorParams(gene_folder_path, print_text_completion=True, store_text_completion=True)
evaluator = Evaluator(params)
# Evaluation back-end; the commented lines are alternatives to swap in.
# evaluation_model = mutation_model
evaluation_model = Model(provider="fireworks", model_name="accounts/fireworks/models/llama-v2-13b-chat")
# evaluation_model = Model(provider="fireworks", model_name="accounts/fireworks/models/llama-v2-70b-chat")
# evaluation_model = Model(provider="openai", model_name="gpt-3.5-turbo-instruct")

# Each loop gets its own variable: the inner loops used to reuse `i`, which
# overwrote the generation index.
for generation in range(NUM_GENERATIONS):
    print(f"Gen {generation + 1}:")
    print("Mutation phase:")

    # Generate the new prompts for this generation.
    new_pop = []
    for _ in range(NUM_NEW_POPULATION):
        # Pick a random parent and apply a randomly chosen mutation to it.
        prompt = random.choice(pop)
        new_prompt = prompt_mutator.random_mutate(prompt)
        new_pop.append(new_prompt)

    for _ in range(NUM_CROSSOVER_PAIRS):
        # Pick two distinct parents; crossover returns two children.
        prompt1, prompt2 = random.sample(pop, 2)
        new_prompt1, new_prompt2 = prompt_mutator.crossover(prompt1, prompt2)
        new_pop.append(new_prompt1)
        new_pop.append(new_prompt2)

    # Evaluate parents and children together.
    # NOTE(review): presumably evaluate() updates each prompt's accuracy and
    # evaluation count in place, since those are read back below — confirm.
    all_pop = pop + new_pop
    scores = evaluator.evaluate(evaluation_model, task, all_pop, NUM_EVALS)

    # Print the prompts ranked by accuracy, best first.
    print(f"Generation {n_gen + 1}")
    prompt_scores = sorted(
        ((candidate.get_accuracy(), candidate) for candidate in all_pop),
        key=lambda scored: scored[0],
        reverse=True,
    )
    print("Score  | Evals | Prompt")
    for _, prompt in prompt_scores:
        print(f"{prompt.get_accuracy() * 100:5.2f}% | {prompt.get_num_evals():5d} | {prompt}")

    # Store the generation data (unsorted, in all_pop order).
    df = pd.DataFrame({
        "Gene": [candidate.gene() for candidate in all_pop],
        "Evaluations": [candidate.get_num_evals() for candidate in all_pop],
        "Accuracy": [candidate.get_accuracy() for candidate in all_pop],
    })
    df.to_csv(f"{generation_folder_path}/run_{n_gen}.csv", index=False)

    # Survival of the fittest: keep the top NUM_POPULATION prompts.
    # TODO: Create a more elaborate selection mechanism that keeps some weaker member
    pop = [candidate for _, candidate in prompt_scores[:NUM_POPULATION]]

    # Elapsed time is cumulative: measured from the start of the whole run,
    # not from the start of this generation.
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Execution time: {execution_time:.2f} seconds")

    # n_gen lives outside this cell, so re-running the cell continues the
    # generation numbering (and CSV file names) instead of restarting at 0.
    n_gen += 1

Gen 1:
Mutation phase:
