In [1]:
from openai import OpenAI
import os

# Take the API key from the OPENAI_API_KEY environment variable rather than
# hard-coding it in the notebook, so the secret is never saved or shared with
# the file. An explicit empty string would be sent as the key as-is.
client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [3]:
import json
import random
def gen_eval_prompt(eval_item):
    """Build the evaluation prompt for one dataset item.

    Args:
        eval_item: Mapping with "instruction", "input" and "output" entries.

    Returns:
        The instruction and the input on their own lines, followed by the
        fixed response-format instructions; the text ends with a newline.

    Side effect:
        Prints the item's expected "output" to stdout.
    """
    prompt_lines = [
        f'{eval_item["instruction"]}',
        f'{eval_item["input"]}',
        "Your response format should be:",
        "Answer: <Yes or No>",
        "Explanation: <Your explanation here.>",
        "Please be concise and to the point.",
        "",  # trailing empty entry yields the final newline
    ]
    prompt = "\n".join(prompt_lines)
    print("answer to this prompt is:", eval_item["output"])
    return prompt

"""
json file, content like this:
[
    {
        "instruction": "Given the user's preference and unpreference, identify whether the user will like the target movie by answering \"Yes.\" or \"No.\".",
        "input": "User Preference: \"Clockwork Orange, A (1971)\"\nUser Unpreference: \"Mary Poppins (1964)\", \"Graduate, The (1967)\", \"Sense and Sensibility (1995)\", \"Taxi Driver (1976)\", \"Young Frankenstein (1974)\", \"Aladdin (1992)\", \"Amadeus (1984)\", \"This Is Spinal Tap (1984)\", \"When Harry Met Sally... (1989)\"\nWhether the user will like the target movie \"Dead Poets Society (1989)\"?",
        "output": "No."
    },
    ...... 
]
Load this json file, count all the items.
Then select 3 items randomly as evaluation set.
"""
# Seed the generator so the same three evaluation items are drawn every run.
random.seed(0)
with open("./data/movie/eval.json") as f:
    all_data = json.load(f)
print("Total %d items in the file." % len(all_data))
# Draw the evaluation subset without replacement.
eval_set = random.sample(all_data, k=3)

Total 1000 items in the file.


In [17]:
# """
# Read all data in json file ./data/movie/valid.json (total 1000 items), shuffle them, and select 100 items,
# store them in ./data/movie/eval.json
# """
# random.seed(0)
# with open("./data/movie/valid.json", "r") as f:
#     all_data = json.load(f)
#     random.shuffle(all_data)
#     eval_data = all_data[:100]
#     with open("./data/movie/eval.json", "w") as f:
#         json.dump(eval_data, f, indent=4)

# random.seed(0)
# with open("./data/gsm8k/test.jsonl", "r") as f:
#     all_data = []
#     for line in f:
#         all_data.append(json.loads(line))
#     random.shuffle(all_data)
#     eval_data = all_data[:100]
#     with open("./data/gsm8k/eval.json", "w") as f:
#         json.dump(eval_data, f, indent=4)

In [11]:
"""
response format:
[{'golden','answer','explanation'}, ...]
"""
# Accumulates one parsed {"golden", "answer", "explanation"} dict per evaluated item.
gpt3_5_response = []
def parse_response(response, eval_item):
    """Extract the model's answer and explanation from a chat message.

    The reply is expected to look like::

        Answer: <Yes or No>
        Explanation: <free text>

    Parsing is tolerant: labels are matched case-insensitively, blank lines
    are skipped, only the first colon on a line separates label from value
    (so explanations may themselves contain colons), and lines following the
    "Explanation:" line are treated as its continuation.

    Args:
        response: Object with a ``content`` attribute holding the reply text.
        eval_item: Dataset item; its "output" entry is the expected answer,
            e.g. "No.".

    Returns:
        dict with keys "golden" (expected answer without trailing period),
        "answer" and "explanation". A field that cannot be found in the
        reply is returned as an empty string instead of raising, so one
        malformed reply does not abort a whole evaluation run.
    """
    answer = None
    explanation_parts = None
    for raw_line in (response.content or "").splitlines():
        line = raw_line.strip()
        if not line:
            continue
        # partition splits on the FIRST colon only, keeping later colons intact.
        label, sep, rest = line.partition(":")
        label = label.strip().lower()
        if sep and label == "answer" and answer is None:
            answer = rest.strip()
        elif sep and label == "explanation" and explanation_parts is None:
            explanation_parts = [rest.strip()]
        elif explanation_parts is not None:
            # Continuation of a multi-line explanation.
            explanation_parts.append(line)
    explanation = " ".join(part for part in (explanation_parts or []) if part)
    # eval_item["output"] is like "No."; drop the trailing period only if present.
    golden = eval_item["output"].strip().rstrip(".")
    return {"golden": golden, "answer": answer or "", "explanation": explanation}
# Ask gpt-3.5-turbo about every evaluation item and collect the parsed replies.
for eval_item in eval_set:
    # Single-turn conversation: the whole prompt is sent as one user message.
    msgs = [dict(role="user", content=gen_eval_prompt(eval_item))]
    completion = client.chat.completions.create(messages=msgs, model="gpt-3.5-turbo")
    # Echo the raw reply followed by a separator line.
    print(completion.choices[0].message.content, "=" * 50, sep="\n")
    gpt3_5_response.append(parse_response(completion.choices[0].message, eval_item))

# import time
import datetime
import os
# Write gpt3_5_response to a JSON file.
response_dir = "./data/response/movie"
# File name format: res_gpt35_{MMDD_HHMMSS}.json
# file_name = f"{response_dir}/res_gpt35_{int(time.time())}.json"
file_name = f"{response_dir}/res_gpt35_{datetime.datetime.now().strftime('%m%d_%H%M%S')}.json"
# Create the output directory if it does not exist yet; exist_ok=True avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(response_dir, exist_ok=True)
with open(file_name, "w") as f:
    json.dump(gpt3_5_response, f, indent=4)
# Report only after the file has been closed, i.e. fully written.
print(f"Response saved to {file_name}")

answer to this prompt is: No.
Answer: No
Explanation: Based on the user's preference for older movies like "Cool Runnings" and "Angel and the Badman", it is unlikely they will enjoy the more contemporary comedy of "National Lampoon's Senior Trip".
answer to this prompt is: No.
Answer: Yes
Explanation: Since the user has shown a preference for action-packed movies like "Terminator 2: Judgment Day" and "Transformers: The Movie, The", they may enjoy "Star Trek III: The Search for Spock" which also has elements of sci-fi action.
answer to this prompt is: Yes.
Answer: No
Explanation: "The Secret Agent" is a 1996 drama film, which is not in the same genre or style as the user's preferred movies such as "Seventh Seal" or "Pulp Fiction". Additionally, it does not align with the user's preference for classic or Western films like "Good, The Bad and The Ugly" or "Wild Bunch".
Response saved to ./data/response/movie/res_gpt35_0515_100205.json


In [13]:
"""
Given a file with such format:
[{'golden','answer','explanation'}, ...]
Calculate the accuracy of the model.
e.g. accuracy = [correct_num where golden == answer] / total_num
"""
# Saved response file to score. The name embeds a run timestamp, so point this
# at the file reported by the "Response saved to ..." message of the run to evaluate.
response = "./data/response/movie/res_gpt35_0515_100205.json"
def _yes_no_label(text):
    """Return "yes" or "no" if `text` contains exactly one of them as a whole word, else None."""
    # Replace every non-letter with a space so punctuation ("No.", "Yes,") does
    # not glue onto the word, then compare whole words case-insensitively.
    words = set("".join(ch if ch.isalpha() else " " for ch in str(text).lower()).split())
    has_yes = "yes" in words
    has_no = "no" in words
    if has_yes == has_no:
        # Neither word, or both: the label is ambiguous.
        return None
    return "yes" if has_yes else "no"


def rectask_evaluation(response_file):
    """Compute the accuracy of saved model responses on the yes/no task.

    Args:
        response_file: Path to a JSON file holding a list of dicts with at
            least the keys "golden" and "answer".

    Returns:
        Fraction of items whose answer carries the same yes/no label as the
        golden value. Labels are matched as whole words, so "Not sure" is not
        read as "No", and an answer containing both "Yes" and "No" counts as
        incorrect. Returns 0.0 for an empty file instead of dividing by zero.

    Side effect:
        Prints the accuracy to stdout.
    """
    with open(response_file, "r") as f:
        records = json.load(f)

    correct_num = 0
    for item in records:
        golden_label = _yes_no_label(item["golden"])
        if golden_label is not None and golden_label == _yes_no_label(item["answer"]):
            correct_num += 1

    accuracy = correct_num / len(records) if records else 0.0
    print(f"Accuracy: {accuracy}")
    return accuracy

# Score the saved run; `accuracy` is reused further down to size the negative sample.
accuracy = rectask_evaluation(response)

Accuracy: 0.3333333333333333


In [None]:
"""
accuracy: accuracy_value
sample_total = 5
neg_sample = round(sample_total * (1 - accuracy_value))
There is a response list with format:
[{'golden','answer','explanation'}, ...]
Select neg_sample items randomly from the response list where golden != answer
Select sample_total - neg_sample items randomly from the response list where golden == answer
Encapsulate the selected items into a prompt with format:
"I am trying to ask for LLM to finish a task with a prompt like that:
{instruction}
But in several cases, the model's response is not correct.

case 1:
input: {input}
output: {output}
model response: {answer}
model explanation: {explanation}
case 2: ...... (more cases)

Please give 3 reasons why the model's response is not correct.
Then give me a modified prompt which you think can help the model to improve its performance.
Your answer should have such format:
Reason:
1. <reason 1>
2. <reason 2>
3. <reason 3>
Modified Prompt:
<modified prompt here>"

Then get the reason and modified prompt.
"""
sample_total = 5
# Share of wrong cases in the sample mirrors the model's error rate.
neg_sample = round(sample_total * (1 - accuracy))
print(f"neg_sample: {neg_sample} in {sample_total}")

# `response` holds the PATH of the saved response file, not the records
# themselves; load the records before filtering. (A list is accepted as-is.)
if isinstance(response, str):
    with open(response, "r") as f:
        response_items = json.load(f)
else:
    response_items = list(response)

# Items where golden != answer (one says "Yes", the other says "No").
neg_pool = [
    item for item in response_items
    if ("Yes" in item["golden"] and "No" in item["answer"])
    or ("No" in item["golden"] and "Yes" in item["answer"])
]
# Items where golden == answer.
pos_pool = [
    item for item in response_items
    if ("Yes" in item["golden"] and "Yes" in item["answer"])
    or ("No" in item["golden"] and "No" in item["answer"])
]

# random.sample raises ValueError when asked for more items than the pool
# holds, so cap each request at the pool size.
neg_items = random.sample(neg_pool, min(neg_sample, len(neg_pool)))
pos_items = random.sample(pos_pool, min(sample_total - neg_sample, len(pos_pool)))

In [None]:
"""

"""