In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.59.6
    Uninstalling openai-1.59.6:
      Successfully uninstalled openai-1.59.6
Successfully installed openai-0.28.0


In [7]:
import pandas as pd
from tqdm import tqdm
import openai

def judge_correctness_with_api(row, llm):
    """
    GPT API를 사용하여 generated-answer와 answer 비교.
    """
    chat_history = [
        {
            "role": "system",
            "content": (
                "You are a helpful assistant. "
                "The generated answer includes reasoning, and the answer is contained within the reasoning. "
                "Your task is to extract the answer from the reasoning and compare it to the correct answer. "
                "If they are identical or convey the same meaning, respond with 'correct'. "
                "If they are different, respond with 'wrong'."
            )
        },
        {
            "role": "user",
            "content": (
                f"Correct Answer: {row['Answer']}\n"
                f"Generated Answer: {row['Generated-Answer']}\n\n"
                "Are the two answers identical or convey the same meaning?"
            )
        }
    ]

    response = llm.ChatCompletion.create(
        model="gpt-4o",
        messages=chat_history,
        temperature=0.7
    )
    return response.choices[0].message.content.strip().lower()


def evaluate_answers(input_csv, llm, output_csv=None):
    """
    CSV 파일에서 answer와 generated-answer 비교하여 정답 여부 판단.
    결과를 새로운 DataFrame으로 저장하고, 정답/오답 수와 목록 출력.
    """
    df = pd.read_csv(input_csv)

    results = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        correctness = judge_correctness_with_api(row, llm)
        results.append(correctness)

    df['Correctness'] = results

    correct_df = df[df['Correctness'] == 'correct']
    incorrect_df = df[df['Correctness'] == 'wrong']

    correct_count = len(correct_df)
    incorrect_count = len(incorrect_df)

    print(f"Correct: {correct_count}, Wrong: {incorrect_count}")
    print("\nCorrect Problems:\n", correct_df[['Question', 'Answer', 'Generated-Answer']])
    print("\nWrong Problems:\n", incorrect_df[['Question', 'Answer', 'Generated-Answer']])

    # 결과 저장 (선택 사항)
    if output_csv:
        df.to_csv(output_csv, index=False, encoding='utf-8')

    return df, correct_df, incorrect_df

In [8]:
# Example Usage
input_csv = "/content/drive/MyDrive/fold/sole-step-once-injection.csv"  
output_csv = "./sole-step-once-results.csv" 
openai.api_key = "" 

# 평가 수행
df, correct_df, incorrect_df = evaluate_answers(input_csv, openai, output_csv)

100%|██████████| 100/100 [00:52<00:00,  1.91it/s]

Correct: 27, Wrong: 72

Correct Problems:
                                              Question  Answer  \
0   Janet’s ducks lay 16 eggs per day. She eats th...      18   
1   A robe takes 2 bolts of blue fiber and half th...       3   
4   Every day, Wendi feeds each of her chickens th...      20   
9   Eliza's rate per hour for the first 40 hours s...     460   
15  A merchant wants to make a choice of purchase ...     125   
17  Jill gets paid $20 per hour to teach and $30 t...   57500   
27  Cynthia eats one serving of ice cream every ni...      16   
28  Henry made two stops during his 60-mile bike t...      25   
31  Gunter is trying to count the jelly beans in a...      80   
33  Gretchen has 110 coins. There are 30 more gold...      70   
43  According to its nutritional info, a bag of ch...      48   
53  A mechanic charges different rates to repair t...      40   
57  A wooden bridge can carry no more than 5000 po...      83   
58  Stephen placed an online order for grocerie


