In [1]:
# Make the ch07/03_model-evaluation directory importable via sys.path.
import sys
import os

# package_path is the grandparent of the current working directory, so the
# result depends on where the notebook kernel was started.
package_path = os.path.dirname(os.path.dirname(os.getcwd()))
file_path = os.path.join(package_path, "ch07", "03_model-evaluation")
print(file_path)
# Only append once: re-running this cell must not keep growing sys.path.
if file_path not in sys.path:
    sys.path.append(file_path)

import torch
# Pick the compute device in priority order: CUDA, then MPS, otherwise CPU.
# The chained conditional short-circuits, so the MPS check only runs when
# CUDA is unavailable.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

/Users/young/project/llmProject/LLMs-from-scratch-CN/ch07/03_model-evaluation


## 测试 OpenAI 兼容 API（使用 DeepSeek）

In [2]:
def run_chatgpt(prompt, client, model="gpt-4-turbo"):
    """Send one user prompt to a chat-completions endpoint and return the reply text.

    Args:
        prompt: Text sent as the only message, with role "user".
        client: Object exposing ``chat.completions.create``.
        model: Model name forwarded to the endpoint.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    # Zero temperature plus a fixed seed are requested on every call.
    request = dict(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
        seed=123,
    )
    completion = client.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [14]:
from openai import OpenAI
from config_manager import config_manager

config = config_manager.config
# Look up the "api" section once; every setting below is read from it.
api_settings = config.get('api', {})

api_key = api_settings.get('deepseek_api_key')
if not api_key:
    raise ValueError("需要提供API密钥")
model = api_settings.get('model', 'deepseek-chat')

# Build the OpenAI client with the key and base URL taken from the config.
client = OpenAI(api_key=api_key, base_url=api_settings.get('deepseek_api_url'))

In [6]:
# Smoke test: send a trivial prompt through run_chatgpt with the configured
# client and model. The literal has no placeholders, so it is a plain string.
prompt = "Respond with 'hello world' if you got this message."
run_chatgpt(prompt, client, model)

'hello world'

## 加载评估样本并对模型回复打分

In [7]:
# Load the raw test data
import json

data_dir = "../../ch07/03_model-evaluation/"
json_file = "eval-example-data.json"

# Pass the encoding explicitly so the read does not depend on the platform's
# default locale encoding.
with open(data_dir + json_file, "r", encoding="utf-8") as file:
    json_data = json.load(file)

print("Number of entries:", len(json_data))

Number of entries: 100


In [8]:
# Peek at the first record to see which fields each entry carries.
json_data[0]

{'instruction': 'Calculate the hypotenuse of a right triangle with legs of 6 cm and 8 cm.',
 'input': '',
 'output': 'The hypotenuse of the triangle is 10 cm.',
 'model 1 response': '\nThe hypotenuse of the triangle is 3 cm.',
 'model 2 response': '\nThe hypotenuse of the triangle is 12 cm.'}

In [9]:
def format_input(entry):
    """Render one dataset entry as an instruction prompt.

    Args:
        entry: Mapping with an ``'instruction'`` key and an ``'input'`` key;
            the ``'input'`` value may be empty.

    Returns:
        The fixed preamble followed by a ``### Instruction:`` section and,
        only when ``entry['input']`` is truthy, a ``### Input:`` section.
    """
    # The preamble previously read "Wrtie a response"; the typo went verbatim
    # into every prompt, so it is corrected here.
    instruction_text = (
        "Below is an instruction that describes a task. Write a response that "
        "appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    # Leave the Input section out entirely when the entry has no input text.
    input_text = f"\n\n### Input:\n{entry['input']}" if entry['input'] else ""

    return instruction_text + input_text

In [11]:
# Quick check on a handful of samples: print the reference answer, model 1's
# answer, and the judge's free-form assessment for the first five entries.
for entry in json_data[:5]:
    # No "number only" instruction here, so the reply is free-form text.
    prompt = (
        f"Given the input `{format_input(entry)}` "
        f"and correct output `{entry['output']}`, "
        f"score the model response `{entry['model 1 response']}`"
        " on a scale from 0 to 100, where 100 is the best score. "
    )

    print("\nDataset response:")
    print(">>", entry['output'])
    print("\nModel response:")
    print(">>", entry["model 1 response"])
    print("\nScore:")
    print(">>", run_chatgpt(prompt, client, model))
    print("\n-------------------------")


Dataset response:
>> The hypotenuse of the triangle is 10 cm.

Model response:
>> 
The hypotenuse of the triangle is 3 cm.

Score:
>> To score the model response, we'll consider the accuracy of the answer compared to the correct output.

**Correct Output:**  
"The hypotenuse of the triangle is 10 cm."  
*(This is correct because, using the Pythagorean theorem: \( \sqrt{6^2 + 8^2} = \sqrt{36 + 64} = \sqrt{100} = 10 \).)*

**Model Response:**  
"The hypotenuse of the triangle is 3 cm."  
*(This is incorrect, as the correct hypotenuse is 10 cm.)*

### Scoring:
- **Accuracy:** The model's answer is completely wrong (3 cm vs. 10 cm).  
- **Relevance:** The response is relevant to the task but fails to provide the correct calculation.  
- **Confidence:** The model states the answer confidently, but it is incorrect.  

Given that the answer is entirely incorrect, the score should reflect a complete failure to solve the task correctly.  

**Score: 0/100**  

*(If the model had provided a part

In [12]:
from tqdm import tqdm

def generate_model_scores(json_data, json_key, client, judge_model=None):
    """Ask the judge model for a 0-100 score of one response field per entry.

    Args:
        json_data: Iterable of entries; each is passed to ``format_input`` and
            must also provide ``'output'`` and ``json_key``.
        json_key: Key of the response to grade, e.g. ``"model 1 response"``.
        client: Client object forwarded to ``run_chatgpt``.
        judge_model: Model name used for grading. When omitted, the
            notebook-level ``model`` variable is used, matching the previous
            behaviour of this function.

    Returns:
        List of integer scores. Replies that cannot be converted to ``int``
        are printed and skipped, so the list may be shorter than ``json_data``.
    """
    if judge_model is None:
        # Backward-compatible fallback to the notebook-level setting.
        judge_model = model

    scores = []
    for entry in tqdm(json_data, desc="Scoring entries"):
        prompt = (
            f"Given the input `{format_input(entry)}` "
            f"and correct output `{entry['output']}`, "
            f"score the model response `{entry[json_key]}`"
            " on a scale from 0 to 100, where 100 is the best score. "
            "Respond with the number only."
        )

        score = run_chatgpt(prompt, client, judge_model)
        try:
            scores.append(int(score))
        except (ValueError, TypeError):
            # ValueError: the reply contains more than a bare number.
            # TypeError: the reply is None; without this branch a single
            # empty reply would abort the whole run.
            print(f"Error converting score to int: {score}")

    return scores

In [15]:
from pathlib import Path

for m in ("model 1 response", "model 2 response"):
    scores = generate_model_scores(json_data, m, client)
    print(f"\n{m}")
    print(f"Number of scores: {len(scores)} of {len(json_data)}")
    # generate_model_scores skips replies it cannot parse, so the list may be
    # empty; guard the average against a ZeroDivisionError.
    if scores:
        print(f"Average score: {sum(scores)/len(scores):.2f}\n")
    else:
        print("Average score: n/a (no valid scores)\n")

    # Optionally save the scores, one file per response key.
    # (The earlier snippet wrote to `file` instead of `f` and named the file
    # after `model`, so both passes would have targeted the same path.)
    # save_path = Path(f"{m.replace(' ', '-')}-scores.json")
    # with open(save_path, "w") as f:
    #     json.dump(scores, f)

Scoring entries:  98%|█████████▊| 98/100 [15:39<00:41, 20.96s/it]

Error converting score to int: The model response correctly classifies the given animals (Eagle, Shark, Cobra) but includes many incorrect and redundant entries (Squirrel, Tiger, repeated Cobra). 

Scoring:
- Correct classifications: 3 (Eagle, Shark, Cobra)
- Incorrect classifications: 9 (Squirrel, Tiger, repeated Cobra)
- Redundant entries: 6 (repeated Cobra, Tiger, Squirrel)

The score should reflect the accuracy of the response relative to the correct output. Since the correct part is only 3 out of 12 lines (25%), but the core task (classifying the 3 given animals) is fully correct, the score should be adjusted to account for both the correctness and the noise.

A fair score would be **50**. 

(Note: If the scoring is strictly based on exact match to the correct output, the score would be lower, but considering the core task is done correctly, 50 is a balanced score.) 

Final answer: `50`


Scoring entries: 100%|██████████| 100/100 [15:55<00:00,  9.56s/it]



model 1 response
Number of scores: 99 of 100
Average score: 81.01



Scoring entries: 100%|██████████| 100/100 [09:00<00:00,  5.40s/it]


model 2 response
Number of scores: 100 of 100
Average score: 68.25






In [16]:
# `scores` was reassigned on every pass of the loop above, so this shows only
# the scores of the last key processed ("model 2 response").
scores

[50,
 100,
 50,
 100,
 50,
 100,
 5,
 100,
 50,
 50,
 60,
 20,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 95,
 50,
 90,
 100,
 100,
 50,
 0,
 100,
 50,
 50,
 100,
 10,
 10,
 95,
 20,
 80,
 0,
 10,
 0,
 100,
 100,
 95,
 100,
 100,
 100,
 10,
 50,
 0,
 50,
 100,
 100,
 100,
 0,
 50,
 100,
 10,
 100,
 85,
 100,
 100,
 20,
 100,
 100,
 100,
 50,
 95,
 10,
 75,
 95,
 100,
 85,
 0,
 85,
 100,
 0,
 20,
 100,
 100,
 100,
 50,
 100,
 50,
 75,
 100,
 100,
 50,
 100,
 10,
 20,
 100,
 90,
 100,
 70,
 100,
 100,
 10,
 100,
 100,
 50,
 10,
 10]