In [None]:
from utils import *
""" Globals """
# Model queried with the task prompt, and model asked to reflect on / refine it.
TARGET_MODEL = "gpt-3.5-turbo"
OPTIMIZER_MODEL = "gpt-3.5-turbo"

# Run identifier; it is the leaf folder of both response directories below.
TASK_CODE_NAME = 'summary-greedy-2'
MOVIE_RESPONSES_DIR = "../data/response/movie" + '/' + TASK_CODE_NAME
GSM8K_RESPONSES_DIR = "../data/response/gsm8k" + '/' + TASK_CODE_NAME

# When True, the optimisation loop prints per-iteration progress.
DEBUG = True

# Dataset selector: the later cells write to MOVIE_RESPONSES_DIR when this is
# "movie" and to GSM8K_RESPONSES_DIR for any other value.
CURRENT_DATASET = "gsm8k"
eval_file_path = '../data/gsm8k/eval.json'

# Create the output directory of the dataset that is actually selected, using
# the same selection rule as the cells that write into it. Previously only the
# GSM8K directory was ever created, regardless of CURRENT_DATASET.
# exist_ok=True makes re-running this cell a no-op and avoids the
# check-then-create race of `if not exists: makedirs`.
os.makedirs(
    MOVIE_RESPONSES_DIR if CURRENT_DATASET == "movie" else GSM8K_RESPONSES_DIR,
    exist_ok=True,
)

In [None]:
# Optimisation-loop settings.
iter_limit = 10            # upper bound on evaluate -> reflect -> refine rounds
FEEDBACK_REASONS_NUM = 2   # forwarded to gen_reflection
EVAL_SAMPLE_NUM = 50       # sample count requested from the eval file
ERROR_SAMPLE_NUM = 3       # forwarded to get_error_example_sets_gsm8k

# Seed the stdlib RNG before drawing the evaluation samples
# (presumably consumed by gen_samples_from_dataset -- confirm in utils).
random.seed(2)
eval_set = gen_samples_from_dataset(
    eval_file_path,
    EVAL_SAMPLE_NUM,
    keep_orginal_order=False,  # keyword spelling is the helper's own
)

# Prompt history; the optimisation loop always works on the last element.
prompt_insts = get_initial_prompt_insts(CURRENT_DATASET, 1)

# Placeholder for the best prompt seen so far (same keys as the history entries
# appended by the optimisation loop).
best_prompt_inst = dict(inst="", accuracy=0.0, responses_path="")

# One (reflection_prompt, reasons, refinement_prompt, improved_inst) tuple is
# appended per refinement round.
reflection_refinement_record = list()

In [None]:
# Greedy reflect-and-refine loop. Each round:
#   1. runs the target model with the newest prompt and scores the responses,
#   2. updates the best prompt seen so far,
#   3. asks the optimizer model for failure reasons and a refined prompt, which
#      becomes the newest prompt for the next round.
for i in range(iter_limit):
    if DEBUG:
        print(f"\n\n>>> Current iteration: {i}")

    # The newest prompt is always the last history entry; it is filled in place.
    current_inst = prompt_insts[-1]
    responses_dir = MOVIE_RESPONSES_DIR if CURRENT_DATASET == "movie" else GSM8K_RESPONSES_DIR

    responses = get_target_model_responses(
        CURRENT_DATASET, TARGET_MODEL, MOVIE_RESPONSES_DIR, GSM8K_RESPONSES_DIR,
        eval_set, current_inst["inst"], if_print=False
    )
    current_inst["responses_path"] = write_target_model_responses(
        TARGET_MODEL + f"_{i}",
        responses_dir,
        responses
    )
    current_inst["accuracy"] = evaluation_gsm8k(CURRENT_DATASET, current_inst["responses_path"])

    # Adopt the current prompt when it beats the best one, or when the "best"
    # is still the unevaluated placeholder (empty responses_path). Without the
    # second condition a run in which every prompt scores 0.0 would leave the
    # placeholder -- an empty instruction -- as the best prompt.
    if (not best_prompt_inst["responses_path"]
            or current_inst["accuracy"] > best_prompt_inst["accuracy"]):
        best_prompt_inst = current_inst
    if DEBUG:
        print(f">>> Current accuracy rate: {current_inst['accuracy']}")
    if best_prompt_inst["accuracy"] == 1.0:
        print(f"Early stop at iteration {i}")
        break

    # A prompt refined in the last round would never be evaluated: it would
    # stay in prompt_insts with accuracy 0.0 and an empty responses_path,
    # distorting any statistics computed over the history, and would cost two
    # optimizer calls for nothing. Stop before generating it.
    if i == iter_limit - 1:
        break

    # Reflect on a sample of wrong answers, then refine the prompt.
    error_example_set = get_error_example_sets_gsm8k(current_inst["responses_path"], ERROR_SAMPLE_NUM, 1)[0]
    reflection_prompt = gen_reflection(FEEDBACK_REASONS_NUM, current_inst["inst"], error_example_set)
    reasons = get_reflection_from_optimizer(OPTIMIZER_MODEL, reflection_prompt)
    refinement_prompt = gen_refinement(current_inst["inst"], error_example_set, reasons)
    improved_inst = get_refinement_from_optimizer(OPTIMIZER_MODEL, refinement_prompt)[0]
    prompt_insts.append({"inst": improved_inst, "accuracy": 0.0, "responses_path": ""})
    reflection_refinement_record.append((reflection_prompt, reasons, refinement_prompt, improved_inst))
print("="*50)

In [None]:
# Print every prompt in the history together with its score and response file.
print("Prompt instances:")
for iter_index, iter_inst in enumerate(prompt_insts):
    print(f"In iteration {iter_index}: ")
    print(f">>> Prompt: {iter_inst['inst']}")
    print(f">>> Accuracy: {iter_inst['accuracy']}")
    print(f">>> Responses path: {iter_inst['responses_path']}")
print("="*50)
print("Best prompt instance:", best_prompt_inst)

# Persist the reflection/refinement trace next to the response files.
record_save_path = MOVIE_RESPONSES_DIR if CURRENT_DATASET == "movie" else GSM8K_RESPONSES_DIR
record_save_path += "/record.txt"
# The prompts and reasons are model-generated text and may contain non-ASCII
# characters; an explicit UTF-8 encoding keeps the write from failing (or
# producing locale-dependent bytes) where the platform default is not UTF-8.
with open(record_save_path, "w", encoding="utf-8") as f:
    # Each record is (reflection_prompt, reasons, refinement_prompt, improved_inst).
    for index, record in enumerate(reflection_refinement_record):
        f.write(f"\n>>> Iteration {index}:\n")
        f.write(f">>> Reflection prompt: \n{record[0]}\n")
        f.write(f">>> Reasons: \n{record[1]}\n")
        f.write(f">>> Refinement prompt: \n{record[2]}\n")
        f.write(f">>> Improved prompt: \n{record[3]}\n")

In [None]:
# Summarise the accuracy stored on every entry of the prompt history.
print("Accuracy list:")
accuracies = [entry["accuracy"] for entry in prompt_insts]
# Each value is followed by ", " (including the last one), then a newline.
print("".join(str(round(acc, 3)) + ", " for acc in accuracies))
mean_accuracy = sum(accuracies)/len(accuracies)
print("average: ", round(mean_accuracy, 3))
print("max: ", round(max(accuracies), 3))

In [None]:
""" test """
# Score the best prompt found by the optimisation loop on the test file.
test_file_path = '../data/gsm8k/test.json'
TEST_SAMPLE_NUM = 100

test_set = gen_samples_from_dataset(
    test_file_path,
    TEST_SAMPLE_NUM,
    keep_orginal_order=True,  # keyword spelling is the helper's own
)

test_responses = get_target_model_responses(
    CURRENT_DATASET,
    TARGET_MODEL,
    MOVIE_RESPONSES_DIR,
    GSM8K_RESPONSES_DIR,
    test_set,
    best_prompt_inst["inst"],
    if_print=False,
)

# Responses go to the movie directory only when the dataset is "movie";
# every other value falls back to the GSM8K directory.
test_responses_path = write_target_model_responses(
    TARGET_MODEL + "_test",
    {"movie": MOVIE_RESPONSES_DIR}.get(CURRENT_DATASET, GSM8K_RESPONSES_DIR),
    test_responses,
)

accuracy_rate = evaluation_gsm8k(CURRENT_DATASET, test_responses_path)
print(f"Accuracy rate on test set: {accuracy_rate}")

In [None]:
# """ initial test for comparison """
# test_file_path = '../data/gsm8k/test.json'
# TEST_SAMPLE_NUM = 100
# test_set = gen_samples_from_dataset(test_file_path, TEST_SAMPLE_NUM, keep_orginal_order=True)
# responses = get_target_model_responses(
#     CURRENT_DATASET, TARGET_MODEL, MOVIE_RESPONSES_DIR, GSM8K_RESPONSES_DIR, 
#     test_set, prompt_insts[0]["inst"], if_print=False
# )
# responses_path = write_target_model_responses(
#     TARGET_MODEL + "_init-test",
#     MOVIE_RESPONSES_DIR if CURRENT_DATASET == "movie" else GSM8K_RESPONSES_DIR, 
#     responses
# )
# accuracy_rate = evaluation_gsm8k(CURRENT_DATASET, responses_path)
# print(f"Accuracy rate on test set with initial prompt: {accuracy_rate}")