# Step 0: Import classes from other files

In [8]:
%load_ext autoreload
%autoreload 2
from constants import *
from evaluator import *
from model import *
from mutator import *
from prompt import *
from task import *

import warnings
warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
import re

def extract_last_numeric_value(input_string):
    # Regular expression pattern to extract all numeric values with periods and ignoring commas
    pattern = r'[\d,.]+'

    # Find all matches using re.finditer()
    matches = re.finditer(pattern, input_string)

    # Initialize a variable to store the last numeric value
    last_numeric_value = None

    # Iterate through the matches and update the last_numeric_value
    for match in matches:
        numeric_value = match.group()
        # Remove commas if needed
        numeric_value = numeric_value.replace(",", "")
        last_numeric_value = numeric_value

    return last_numeric_value

In [10]:
def evaluate_func(orig, pred):
    orig_value = extract_last_numeric_value(orig)
    pred_value = extract_last_numeric_value(pred)
    try:
        return abs(float(orig_value) - float(pred_value)) < 1e-6
    except:
        return False

# Step 1: Initialize the models and the task

In [14]:
from datasets import load_dataset

mutation_model = Model(provider="quantized_llama", model_name="TheBloke/openinstruct-mistral-7B-GGUF", model_file="openinstruct-mistral-7b.Q4_K_M.gguf", model_type="mistral")
evaluation_model = mutation_model

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading config.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading (…)stral-7b.Q4_K_M.gguf:   0%|          | 0.00/4.37G [00:00<?, ?B/s]

In [15]:
system_instruction = "You are genius at mathematical thinking and reasoning."
thinking_style = "Think about this step by step."
task = Task(load_dataset('gsm8k', 'main'),
            'Solve the math word problem, giving your answer as an arabic numeral.',
            evaluate_func)
initial_prompt = Prompt(task.initial_prompt, system_instruction, thinking_style)

In [16]:
task.dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

In [17]:
# Sample zero shot prompt
print(initial_prompt.zero_shot_prompt(task.test_sample()['question']))

SYSTEM: You are genius at mathematical thinking and reasoning.
USER: Think about this step by step., Solve the math word problem, giving your answer as an arabic numeral..
QUESTION: Bob had 7 fish in his ornamental fish pond. 3 were orange, and 4 were white. He decided he wanted to get some more, so he went to the pet store. He had a sales assistant at the pet shop dip out 17 fish out of a mixed tank of both white and orange fish. When he got them home and added them to his pond, he found that he now had twice as many orange fish as white fish. How many white fish did Bob buy at the store?
ANSWER: 



In [18]:
# Sample one shot prompt
print(initial_prompt.one_shot_prompt(
    task.test_sample()['question'],
    task.train_sample()))

SYSTEM: You are genius at mathematical thinking and reasoning.
USER: Think about this step by step., Solve the math word problem, giving your answer as an arabic numeral..
QUESTION: Kendra wants enough shirts that she only has to do laundry once every two weeks. She wears one shirt to school for each of the five weekdays. Three days a week, she changes into a different shirt for an after-school club. On Saturday, she wears one shirt all day. On Sunday, she wears a different shirt to church than she does for the rest of the day. How many shirts does she need to be able to only do laundry once every two weeks?
ANSWER: Kendra wears 1 * 5 = <<1*5=5>>5 shirts for school days.
She wears 1 * 3 = <<1*3=3>>3 extra shirts for her after-school club.
She wears 1 + 1 = <<1+1=2>>2 shirts on Sunday.
With her shirt on Saturday, she wears 5 + 3 + 1 + 2 = <<5+3+1+2=11>>11 shirts each week.
She wants to have 2 weeks’ worth of shirts, so she needs 11 * 2 = <<11*2=22>>22 shirts.
#### 22
QUESTION: John deci

In [19]:
# Sample few shot prompt
print(initial_prompt.few_shot_prompt(
    task.test_sample()['question'],
    task.train()))

SYSTEM: You are genius at mathematical thinking and reasoning.
USER: Think about this step by step., Solve the math word problem, giving your answer as an arabic numeral.. 
QUESTION: CJ, KJ, and AJ collect stamps.  CJ has 5 more than twice the number of stamps that KJ has, and KJ has half as many as AJ.  If the three boys have 930 stamps all together, how many stamps does AJ have?
ANSWER: Let x represent the number of stamps for AJ
KJ:x/2 stamps
CJ:5+2(x/2)=5+x
Total:x+(x/2)+5+x=930
(5/2)x+5=930
(5/2)x=925
x=925(2/5)=370 stamps
#### 370
QUESTION: A baker bought cooking ingredients in the supermarket. She bought 3 boxes of flour that cost $3 each box, 3 trays of eggs that cost $10 for each tray, 7 liters of milk that cost $5 each liter, and 2 boxes of baking soda that cost $3 each box. How much will she have to pay for everything?
ANSWER: The total cost for the flour is 3 x $3 = $<<3*3=9>>9.
The total cost of the tray of eggs is 3 x $10 = $<<3*10=30>>30.
The total cost for the liter of 

# Step 2: Initialize a list of initial prompts using the mutator

In [20]:
NUM_POPULATION = 5
NUM_GENERATIONS = 5
NUM_EVALS = 10

# original_mutators = MUTATORS
original_mutators = [
    "Rewrite the following instruction 10 times in a creative way",
]

original_thinking_styles = THINKING_STYLES

In [21]:
import random

# Make the list of initial populations
initial_pop = [initial_prompt]
prompt_mutator = PromptMutator(mutation_model)
for i in range(1, NUM_POPULATION):
    # Randomly select a mutator and thinking style
    mutator = random.choice(original_mutators)
    thinking_style = random.choice(original_thinking_styles)
    
    # Randomly select prompt mutation mechanism by chance
    new_prompt = prompt_mutator.random_mutate(initial_prompt, mutator)
    initial_pop.append(new_prompt)
    
pop = initial_pop.copy()

In [22]:
for prompt in initial_pop:
    print(prompt)

Solve the math word problem, giving your answer as an arabic numeral.
The question is: A man has 2000 marbles that are each worth $25 and needs to get exactly 85 marbles out of the total pile for a job he’s being paid $5 per marble for. What's his take-home pay from this job?
The answer is 40
(C) 18
Step-by-step reasoning process: You have 50 - 20 = 30 dollars left.


# Step 3: Evaluation and mutation loop

In [None]:
evaluator = Evaluator()

for n_gen in range(NUM_GENERATIONS):
    # Generate n news prompt
    new_pop = []

    for i in range(NUM_POPULATION):
        # Randomly select a prompt, a mutator and thinking style
        prompt = random.choice(pop)
        mutator = random.choice(original_mutators)
        thinking_style = random.choice(original_thinking_styles)

        # Randomly select prompt mutation mechanism by chance
        new_prompt = prompt_mutator.random_mutate(initial_prompt, mutator)
        new_pop.append(new_prompt)
    
    # Evaluate the prompt
    all_pop = pop + new_pop
    scores = evaluator.evaluate(evaluation_model, task, "one", all_pop, NUM_EVALS)
    
    # Print prompt and their score
    print(f"Generation {n_gen + 1}")
    prompt_scores = [(all_pop[i].get_accuracy(), all_pop[i]) for i in range(len(all_pop))]
    prompt_scores = sorted(prompt_scores, key=lambda x: x[0], reverse=True)
    print("Score  | Evals | Prompt")
    for _, prompt in prompt_scores:
        print(f"{prompt.get_accuracy() * 100:5.2f}% | {prompt.get_num_evals():5d} | {prompt}")
        
    # Survival of the fittest
    # TODO: Create a more elaborate selection mechanism that keeps some weaker member
    pop = [prompt_scores[i][1] for i in range(NUM_POPULATION)]

Evaluating test samples: [59, 931, 523, 987, 399, 128, 310, 488, 1012, 806]
Evaluating prompt 0: Solve the math word problem, giving your answer as an arabic numeral.
Evaluating prompt 1: The question is: A man has 2000 marbles that are each worth $25 and needs to get exactly 85 marbles out of the total pile for a job he’s being paid $5 per marble for. What's his take-home pay from this job?
Evaluating prompt 2: The answer is 40
Evaluating prompt 3: (C) 18
Evaluating prompt 4: Step-by-step reasoning process: You have 50 - 20 = 30 dollars left.
Evaluating prompt 5: The answer is 4.
Evaluating prompt 6: The answer is 4.
Evaluating prompt 7: The answer is 4859
Evaluating prompt 8: Your answer is 7.
Evaluating prompt 9: (C) 20
Generation 1
Score  | Evals | Prompt
40.00% |    10 | The answer is 4.
30.00% |    10 | The answer is 4859
20.00% |    10 | The answer is 40
20.00% |    10 | Step-by-step reasoning process: You have 50 - 20 = 30 dollars left.
20.00% |    10 | The answer is 4.
20.00% 