Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
Guideline: {guideline}
"""

SUMMARY_SCREENSHOT_PATH = os.path.join('screenshots', 'summary_screenshot.png')
SCREENSHOT_PATH = os.path.join('screenshots', 'screenshot.png')

# Check whether we are on a Windows terminal that supports ANSI escape codes
def supports_ansi():
Expand Down Expand Up @@ -80,9 +80,9 @@ def parse_eval_content(content):
exit(1)


def evaluate_summary_screenshot(guideline):
'''Load the summary screenshot and return True or False if it meets the given guideline.'''
with open(SUMMARY_SCREENSHOT_PATH, "rb") as img_file:
def evaluate_final_screenshot(guideline):
'''Load the final screenshot and return True if it meets the given guideline, False otherwise.'''
with open(SCREENSHOT_PATH, "rb") as img_file:
img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

eval_message = [{
Expand Down Expand Up @@ -116,9 +116,9 @@ def run_test_case(objective, guideline):
subprocess.run(['operate', '--prompt', f'"{objective}"'], stdout=subprocess.DEVNULL)

try:
result = evaluate_summary_screenshot(guideline)
result = evaluate_final_screenshot(guideline)
except(OSError):
print("Couldn't open the summary screenshot")
print("[Error] Couldn't open the screenshot for evaluation")
return False

return result
Expand All @@ -143,7 +143,7 @@ def main():
failed += 1

print(
f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} tests passed, {failed} tests failed"
f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} test{'' if passed == 1 else 's'} passed, {failed} test{'' if failed == 1 else 's'} failed"
)

if __name__ == "__main__":
Expand Down