In [1]:
%load_ext autoreload
%autoreload 2
from constants import *
from evaluator import *
from model import *
from mutator import *
from prompt import *
from task import *

import warnings
warnings.filterwarnings("ignore")

In [2]:
import re

def extract_last_numeric_value(input_string):
    # Regular expression pattern to extract all numeric values with periods and ignoring commas
    pattern = r'[\d,.]+'

    # Find all matches using re.finditer()
    matches = re.finditer(pattern, input_string)

    # Initialize a variable to store the last numeric value
    last_numeric_value = None

    # Iterate through the matches and update the last_numeric_value
    for match in matches:
        numeric_value = match.group()
        # Remove commas if needed
        numeric_value = numeric_value.replace(",", "")
        last_numeric_value = numeric_value

    return last_numeric_value

In [3]:
def evaluate_func(orig, pred):
    orig_value = extract_last_numeric_value(orig)
    pred_value = extract_last_numeric_value(pred)
    try:
        return abs(float(orig_value) - float(pred_value)) < 1e-6
    except:
        return False

# Step 1: Set up all the models that we want to quantify performance

In [4]:
from datasets import load_dataset

model_mistral = Model(provider="quantized_llama", model_name="TheBloke/Mistral-7B-OpenOrca-GGUF", model_file="mistral-7b-openorca.Q4_K_M.gguf", model_type="mistral")
model_llama_13 = Model(provider="fireworks", model_name="accounts/fireworks/models/llama-v2-13b-chat")
model_llama_70 = Model(provider="fireworks", model_name="accounts/fireworks/models/llama-v2-70b-chat")
model_open_ai = Model(provider="openai", model_name="gpt-3.5-turbo-instruct")

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

# Step 2: All base line evaluations

In [34]:
system_instruction = ""
thinking_style = ""
task = Task(load_dataset('gsm8k', 'main'),
            'Solve the math word problem, giving your answer as an arabic numeral.',
            evaluate_func)

In [6]:
task.test()

Dataset({
    features: ['question', 'answer'],
    num_rows: 1319
})

In [7]:
# Get the test indices, for consistency in evaluations
NUM_EVALUATIONS = 100
test_indices, test_samples = task.test_samples(NUM_EVALUATIONS)
print(test_indices)

[1018, 319, 62, 140, 378, 440, 32, 1061, 92, 396, 64, 1054, 79, 716, 1284, 623, 1238, 399, 1191, 381, 123, 45, 497, 812, 824, 599, 911, 13, 426, 860, 519, 453, 1119, 406, 931, 358, 560, 807, 1232, 424, 941, 996, 258, 1039, 680, 20, 711, 1124, 651, 114, 1041, 1229, 882, 289, 868, 1162, 99, 983, 511, 888, 518, 48, 738, 628, 59, 966, 777, 292, 1196, 821, 660, 644, 545, 321, 190, 329, 400, 1267, 770, 784, 361, 988, 850, 788, 1315, 1163, 210, 142, 1206, 1156, 444, 549, 833, 347, 49, 1231, 422, 1265, 159, 581]


In [32]:
test_indices = [1018, 319, 62, 140, 378, 440, 32, 1061, 92, 396, 64, 1054, 79, 716, 1284, 623, 1238, 399, 1191, 381, 123, 45, 497, 812, 824, 599, 911, 13, 426, 860, 519, 453, 1119, 406, 931, 358, 560, 807, 1232, 424, 941, 996, 258, 1039, 680, 20, 711, 1124, 651, 114, 1041, 1229, 882, 289, 868, 1162, 99, 983, 511, 888, 518, 48, 738, 628, 59, 966, 777, 292, 1196, 821, 660, 644, 545, 321, 190, 329, 400, 1267, 770, 784, 361, 988, 850, 788, 1315, 1163, 210, 142, 1206, 1156, 444, 549, 833, 347, 49, 1231, 422, 1265, 159, 581]

In [33]:
import pandas as pd

def evaluate_model(evaluation_model, folder_path, all_pop):
    all_pop = [prompt.copy() for prompt in all_pop]
    evaluator_params = EvaluatorParams(folder_path, print_text_completion=False, store_text_completion=True, print_evaluation_steps=10)
    evaluator = Evaluator(evaluator_params)
    scores = evaluator.evaluate(evaluation_model, task, all_pop, 100, test_indices)

    # Print prompt and their score
    prompt_scores = [(all_pop[i].get_accuracy(), all_pop[i]) for i in range(len(all_pop))]
    prompt_scores = sorted(prompt_scores, key=lambda x: x[0], reverse=True)
    print("Score  | Evals | Prompt")
    for _, prompt in prompt_scores:
        print(f"{prompt.get_accuracy() * 100:5.2f}% | {prompt.get_num_evals():5d} | {prompt}")

    # Store the generation data
    prompt_genes = [prompt.gene() for prompt in all_pop]
    prompt_evals = [prompt.get_num_evals() for prompt in all_pop]
    prompt_accus = [prompt.get_accuracy() for prompt in all_pop]
    data = {
        "Gene": prompt_genes,
        "Evaluations": prompt_evals,
        "Accuracy": prompt_accus
    }
    df = pd.DataFrame(data)
    df.to_csv(f"{folder_path}/results.csv", index=False)

In [28]:
prompt1 = Prompt(task.initial_prompt, system_instruction, thinking_style, 0, 0, None)
prompt2 = Prompt(task.initial_prompt, system_instruction, thinking_style, 1, 0, None)
prompt3 = Prompt(task.initial_prompt, system_instruction, thinking_style, 3, 0, None)
all_pop = [prompt1, prompt2, prompt3]

# evaluate_model(model_mistral, "base_performance_results/mistral", all_pop)
# evaluate_model(model_llama_13, "base_performance_results/llama13", all_pop)
# evaluate_model(model_llama_70, "base_performance_results/llama70", all_pop)
evaluate_model(model_open_ai, "base_performance_results/openai", all_pop)

Evaluating test samples: [1018, 319, 62, 140, 378, 440, 32, 1061, 92, 396, 64, 1054, 79, 716, 1284, 623, 1238, 399, 1191, 381, 123, 45, 497, 812, 824, 599, 911, 13, 426, 860, 519, 453, 1119, 406, 931, 358, 560, 807, 1232, 424, 941, 996, 258, 1039, 680, 20, 711, 1124, 651, 114, 1041, 1229, 882, 289, 868, 1162, 99, 983, 511, 888, 518, 48, 738, 628, 59, 966, 777, 292, 1196, 821, 660, 644, 545, 321, 190, 329, 400, 1267, 770, 784, 361, 988, 850, 788, 1315, 1163, 210, 142, 1206, 1156, 444, 549, 833, 347, 49, 1231, 422, 1265, 159, 581]
Evaluating prompt 0: (Solve the math word  -                      -                      - 0)
Num evals: 10 - Accuracy: 10.00%
Num evals: 20 - Accuracy: 25.00%
Num evals: 30 - Accuracy: 20.00%
Num evals: 40 - Accuracy: 20.00%
Num evals: 50 - Accuracy: 20.00%
Num evals: 60 - Accuracy: 20.00%
Num evals: 70 - Accuracy: 20.00%
Num evals: 80 - Accuracy: 21.25%
Num evals: 90 - Accuracy: 21.11%
Num evals: 100 - Accuracy: 23.00%
Evaluating prompt 1: (Solve the math wor

# Alejandro Prompts

In [35]:
thinking_style = ""
system_instruction = "You're a tutor, patiently breaking down and explaining this math problem to a student who is just beginning their mathematical journey."
prompt = """Wow, that's a great riddle! Pencil lead is indeed a very useful tool for writing and drawing. It's amazing how something so simple can be used in so many ways. Do you have any other fun riddles you'd like to share?"""

alejandro_prompt1_mistral_0 = Prompt(prompt, system_instruction, thinking_style, 0, 0, None)
alejandro_prompt1_mistral_1 = Prompt(prompt, system_instruction, thinking_style, 1, 0, None)
alejandro_prompt1_mistral_3 = Prompt(prompt, system_instruction, thinking_style, 3, 0, None)

prompts = [alejandro_prompt1_mistral_0, alejandro_prompt1_mistral_1, alejandro_prompt1_mistral_3]

evaluate_model(model_mistral, "best_performance_results/alejandro/mistral1", prompts)

Evaluating test samples: [1018, 319, 62, 140, 378, 440, 32, 1061, 92, 396, 64, 1054, 79, 716, 1284, 623, 1238, 399, 1191, 381, 123, 45, 497, 812, 824, 599, 911, 13, 426, 860, 519, 453, 1119, 406, 931, 358, 560, 807, 1232, 424, 941, 996, 258, 1039, 680, 20, 711, 1124, 651, 114, 1041, 1229, 882, 289, 868, 1162, 99, 983, 511, 888, 518, 48, 738, 628, 59, 966, 777, 292, 1196, 821, 660, 644, 545, 321, 190, 329, 400, 1267, 770, 784, 361, 988, 850, 788, 1315, 1163, 210, 142, 1206, 1156, 444, 549, 833, 347, 49, 1231, 422, 1265, 159, 581]
Evaluating prompt 0: (Wow, that's a great  - You're a tutor, pati -                      - 0)
Num evals: 10 - Accuracy: 20.00%
Num evals: 20 - Accuracy: 20.00%
Num evals: 30 - Accuracy: 16.67%
Num evals: 40 - Accuracy: 12.50%
Num evals: 50 - Accuracy: 10.00%
Num evals: 60 - Accuracy: 11.67%
Num evals: 70 - Accuracy: 11.43%
Num evals: 80 - Accuracy: 11.25%
Num evals: 90 - Accuracy: 10.00%
Num evals: 100 - Accuracy: 9.00%
Evaluating prompt 1: (Wow, that's a great

In [36]:
thinking_style = ""
system_instruction = "Tackle this using computational methods."
prompt = """As a math teacher, I would approach this problem by first understanding the context and purpose of the assistant. I would research the capabilities and limitations of the assistant and how it can be used to assist in problem-solving and decision-making. I would also consider the ethical implications of using an assistant and ensure that it aligns with my values and principles.
Once I have a clear understanding of the assistant's capabilities and limitations, I would develop a set of guidelines for how it should be used. This would include guidelines for how it should respond to different types of questions and requests, as well as guidelines for how it should handle sensitive or confidential information.
I would also develop a system for evaluating the assistant's performance and ensuring that it is meeting the desired standards. This would involve testing the assistant with different scenarios and evaluating its responses to ensure that they are accurate, fair, and unbiased.
Overall, my approach would be to use the assistant as a tool to assist in problem-solving and decision-making, but always with caution and consideration for the ethical implications of its use."""

alejandro_prompt2_mistral_0 = Prompt(prompt, system_instruction, thinking_style, 0, 0, None)
alejandro_prompt2_mistral_1 = Prompt(prompt, system_instruction, thinking_style, 1, 0, None)
alejandro_prompt2_mistral_3 = Prompt(prompt, system_instruction, thinking_style, 3, 0, None)

prompts = [alejandro_prompt2_mistral_0, alejandro_prompt2_mistral_1, alejandro_prompt2_mistral_3]

evaluate_model(model_mistral, "best_performance_results/alejandro/mistral2", prompts)

Evaluating test samples: [1018, 319, 62, 140, 378, 440, 32, 1061, 92, 396, 64, 1054, 79, 716, 1284, 623, 1238, 399, 1191, 381, 123, 45, 497, 812, 824, 599, 911, 13, 426, 860, 519, 453, 1119, 406, 931, 358, 560, 807, 1232, 424, 941, 996, 258, 1039, 680, 20, 711, 1124, 651, 114, 1041, 1229, 882, 289, 868, 1162, 99, 983, 511, 888, 518, 48, 738, 628, 59, 966, 777, 292, 1196, 821, 660, 644, 545, 321, 190, 329, 400, 1267, 770, 784, 361, 988, 850, 788, 1315, 1163, 210, 142, 1206, 1156, 444, 549, 833, 347, 49, 1231, 422, 1265, 159, 581]
Evaluating prompt 0: (As a math teacher, I - Tackle this using co -                      - 0)
Num evals: 10 - Accuracy: 0.00%
Num evals: 20 - Accuracy: 15.00%
Num evals: 30 - Accuracy: 13.33%
Num evals: 40 - Accuracy: 10.00%
Num evals: 50 - Accuracy: 10.00%
Num evals: 60 - Accuracy: 10.00%
Num evals: 70 - Accuracy: 8.57%
Num evals: 80 - Accuracy: 8.75%
Num evals: 90 - Accuracy: 7.78%
Num evals: 100 - Accuracy: 7.00%
Evaluating prompt 1: (As a math teacher, I - 

In [37]:
thinking_style = ""
system_instruction = "As a computer scientist, apply algorithmic thinking and computational techniques to efficiently solve this math problem."
prompt = """Thank you for your thoughtful and comprehensive response. I appreciate your emphasis on ethical and social implications of AI systems, and your commitment to ensuring that AI systems promote positive change and minimize potential harm.
I would like to further suggest that we should also consider the following principles:
11. Addressing Privacy Concerns: AI systems often rely on collecting and processing large amounts of personal data, which raises concerns about privacy and data protection. It's important to ensure that AI systems are designed with privacy in mind, and that they comply with data protection regulations and best practices.
12. Ensuring Accountability and Transparency: AI systems should be designed to ensure accountability and transparency, so that people can understand how they make decisions and how they can be held accountable. This includes developing systems that can provide clear explanations for their decisions and that can be audited and monitored for bias and errors.
13. Fostering Collaboration between AI and Human Experts: AI systems should be designed to collaborate with human experts, rather than replacing them. This includes developing systems that can augment human capabilities, and that can provide valuable insights and recommendations to human decision-makers.
14. Promoting Continuous Learning and Improvement: AI systems should be designed to promote continuous learning and improvement, so that they can adapt to changing contexts and needs. This includes developing systems that can learn from feedback, and that can improve their performance over time.
15. Ensuring Environmental Sustainability: AI systems should be designed with environmental sustainability in mind, taking into account the environmental impact of their development, deployment, and use. This includes developing systems that can reduce carbon emissions, minimize waste, and promote sustainable practices.
By considering these additional principles, I believe that we can create AI systems that promote positive change and minimize potential harm, and that contribute to a better future for all. Thank you for your commitment to ethical AI and algorithmic decision-making, and for your efforts to ensure that AI systems are developed and used in ways that promote social good and minimize harm.
I hope this helps! Let me know if you have any other questions or concerns."""

alejandro_prompt1_llama70_0 = Prompt(prompt, system_instruction, thinking_style, 0, 0, None)
alejandro_prompt1_llama70_1 = Prompt(prompt, system_instruction, thinking_style, 1, 0, None)
alejandro_prompt1_llama70_3 = Prompt(prompt, system_instruction, thinking_style, 3, 0, None)

prompts = [alejandro_prompt1_llama70_0, alejandro_prompt1_llama70_1, alejandro_prompt1_llama70_3]

evaluate_model(model_llama_70, "best_performance_results/alejandro/llama1", prompts)

Evaluating test samples: [1018, 319, 62, 140, 378, 440, 32, 1061, 92, 396, 64, 1054, 79, 716, 1284, 623, 1238, 399, 1191, 381, 123, 45, 497, 812, 824, 599, 911, 13, 426, 860, 519, 453, 1119, 406, 931, 358, 560, 807, 1232, 424, 941, 996, 258, 1039, 680, 20, 711, 1124, 651, 114, 1041, 1229, 882, 289, 868, 1162, 99, 983, 511, 888, 518, 48, 738, 628, 59, 966, 777, 292, 1196, 821, 660, 644, 545, 321, 190, 329, 400, 1267, 770, 784, 361, 988, 850, 788, 1315, 1163, 210, 142, 1206, 1156, 444, 549, 833, 347, 49, 1231, 422, 1265, 159, 581]
Evaluating prompt 0: (Thank you for your t - As a computer scient -                      - 0)
Num evals: 10 - Accuracy: 0.00%
Num evals: 20 - Accuracy: 10.00%
Num evals: 30 - Accuracy: 10.00%
Num evals: 40 - Accuracy: 7.50%
Num evals: 50 - Accuracy: 6.00%
Num evals: 60 - Accuracy: 11.67%
Num evals: 70 - Accuracy: 12.86%
Num evals: 80 - Accuracy: 12.50%
Num evals: 90 - Accuracy: 11.11%
Num evals: 100 - Accuracy: 10.00%
Evaluating prompt 1: (Thank you for your t 

In [38]:
thinking_style = ""
system_instruction = "You are a dedicated graduate student in mathematics, grappling with a challenging and thought-provoking puzzle."
prompt = """Thank you for your thoughtful response. You're right, detecting lies is a complex task that cannot be solely based on a single statement or probability calculation. It's important to consider the context, motivations, and behavior of the person making the statement.
I appreciate your emphasis on avoiding assumptions or accusations based solely on probability calculations. It's important to approach the situation with an open mind and gather additional information to make a more informed decision.
Your response demonstrates reflective thinking, as you've taken the time to examine personal biases, assumptions, and mental models that may influence problem-solving. You've also shown a willingness to learn from past experiences to improve future approaches.
Well done! How can I assist you further?"""

alejandro_prompt2_llama70_0 = Prompt(prompt, system_instruction, thinking_style, 0, 0, None)
alejandro_prompt2_llama70_1 = Prompt(prompt, system_instruction, thinking_style, 1, 0, None)
alejandro_prompt2_llama70_3 = Prompt(prompt, system_instruction, thinking_style, 3, 0, None)

prompts = [alejandro_prompt2_llama70_0, alejandro_prompt2_llama70_1, alejandro_prompt2_llama70_3]

evaluate_model(model_llama_70, "best_performance_results/alejandro/llama2", prompts)

Evaluating test samples: [1018, 319, 62, 140, 378, 440, 32, 1061, 92, 396, 64, 1054, 79, 716, 1284, 623, 1238, 399, 1191, 381, 123, 45, 497, 812, 824, 599, 911, 13, 426, 860, 519, 453, 1119, 406, 931, 358, 560, 807, 1232, 424, 941, 996, 258, 1039, 680, 20, 711, 1124, 651, 114, 1041, 1229, 882, 289, 868, 1162, 99, 983, 511, 888, 518, 48, 738, 628, 59, 966, 777, 292, 1196, 821, 660, 644, 545, 321, 190, 329, 400, 1267, 770, 784, 361, 988, 850, 788, 1315, 1163, 210, 142, 1206, 1156, 444, 549, 833, 347, 49, 1231, 422, 1265, 159, 581]
Evaluating prompt 0: (Thank you for your t - You are a dedicated  -                      - 0)
Num evals: 10 - Accuracy: 0.00%
Num evals: 20 - Accuracy: 10.00%
Num evals: 30 - Accuracy: 6.67%
Num evals: 40 - Accuracy: 5.00%
Num evals: 50 - Accuracy: 4.00%
Num evals: 60 - Accuracy: 6.67%
Num evals: 70 - Accuracy: 7.14%
Num evals: 80 - Accuracy: 6.25%
Num evals: 90 - Accuracy: 5.56%
Num evals: 100 - Accuracy: 5.00%
Evaluating prompt 1: (Thank you for your t - You 

In [40]:
thinking_style = ""
system_instruction = "Use Reflective Thinking: Step back from the problem, take the time for introspection and self-reflection. Examine personal biases, assumptions, and mental models that may influence problem-solving, and being open to learning from past experiences to improve future approaches."
prompt = """Thank you for your thoughtful response. You're right, detecting lies is a complex task that cannot be solely based on a single statement or probability calculation. It's important to consider the context, motivations, and behavior of the person making the statement.
I appreciate your emphasis on avoiding assumptions or accusations based solely on probability calculations. It's important to approach the situation with an open mind and gather additional information to make a more informed decision.
Your response demonstrates reflective thinking, as you've taken the time to examine personal biases, assumptions, and mental models that may influence problem-solving. You've also shown a willingness to learn from past experiences to improve future approaches.
Well done! How can I assist you further?"""

alejandro_prompt_openai_0 = Prompt(prompt, system_instruction, thinking_style, 0, 0, None)
alejandro_prompt_openai_1 = Prompt(prompt, system_instruction, thinking_style, 1, 0, None)
alejandro_prompt_openai_3 = Prompt(prompt, system_instruction, thinking_style, 3, 0, None)

prompts = [alejandro_prompt_openai_0, alejandro_prompt_openai_1, alejandro_prompt_openai_3]

evaluate_model(model_open_ai, "best_performance_results/alejandro/openai", prompts)

Evaluating test samples: [1018, 319, 62, 140, 378, 440, 32, 1061, 92, 396, 64, 1054, 79, 716, 1284, 623, 1238, 399, 1191, 381, 123, 45, 497, 812, 824, 599, 911, 13, 426, 860, 519, 453, 1119, 406, 931, 358, 560, 807, 1232, 424, 941, 996, 258, 1039, 680, 20, 711, 1124, 651, 114, 1041, 1229, 882, 289, 868, 1162, 99, 983, 511, 888, 518, 48, 738, 628, 59, 966, 777, 292, 1196, 821, 660, 644, 545, 321, 190, 329, 400, 1267, 770, 784, 361, 988, 850, 788, 1315, 1163, 210, 142, 1206, 1156, 444, 549, 833, 347, 49, 1231, 422, 1265, 159, 581]
Evaluating prompt 0: (Thank you for your t - Use Reflective Think -                      - 0)
Num evals: 10 - Accuracy: 10.00%
Num evals: 20 - Accuracy: 10.00%
Num evals: 30 - Accuracy: 6.67%
Num evals: 40 - Accuracy: 5.00%
Num evals: 50 - Accuracy: 4.00%
Num evals: 60 - Accuracy: 3.33%
Num evals: 70 - Accuracy: 7.14%
Num evals: 80 - Accuracy: 6.25%
Num evals: 90 - Accuracy: 5.56%
Num evals: 100 - Accuracy: 7.00%
Evaluating prompt 1: (Thank you for your t - Use