In [1]:
from importlib.metadata import version

# Report the installed version of each third-party package this notebook uses
pkgs = [
    "openai",  # OpenAI API client
    "tqdm",    # progress bars
]

for pkg_name in pkgs:
    installed = version(pkg_name)
    print(pkg_name, "version:", installed)

openai version: 1.76.0
tqdm version: 4.67.1


In [2]:
# Append the dataset-utilities folder (two levels above the working directory,
# then ch07/02_dataset-utilities) to sys.path so its modules can be imported
import sys
import os

parent_dir = os.path.dirname(os.getcwd())
package_path = os.path.dirname(parent_dir)
file_path = os.path.join(package_path, "ch07", "02_dataset-utilities")
print(file_path)
sys.path.append(file_path)

import torch

# Choose the device in order of preference: CUDA, then MPS, then CPU
if torch.cuda.is_available():
    device_name = "cuda"
elif torch.backends.mps.is_available():
    device_name = "mps"
else:
    device_name = "cpu"
device = torch.device(device_name)

/Users/young/project/llmProject/LLMs-from-scratch-CN/ch07/02_dataset-utilities


## 测试 OpenAI API（使用 DeepSeek）

In [3]:
def run_chatgpt(prompt, client, model="gpt-4-turbo", temperature=0.0):
    """Send a single-turn user prompt to a chat-completions API and return the reply text.

    Args:
        prompt: Text sent as the content of the only (user) message.
        client: Object exposing ``chat.completions.create(...)``; this notebook
            passes an ``openai.OpenAI`` instance.
        model: Model identifier forwarded to the API.
        temperature: Sampling temperature forwarded to the API. Defaults to
            0.0, the value that was previously hard-coded.

    Returns:
        The ``content`` of the message in the first choice of the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    return response.choices[0].message.content

In [5]:
from openai import OpenAI
from config_manager import config_manager

# Endpoint used when the config does not provide 'deepseek_api_url'
DEFAULT_DEEPSEEK_API_URL = "https://api.deepseek.com"

config = config_manager.config
api_config = config.get('api', {})  # read the 'api' section once

api_key = api_config.get('deepseek_api_key')
if not api_key:
    raise ValueError("需要提供API密钥")
model = api_config.get('model', 'deepseek-chat')

# With base_url=None the OpenAI client falls back to OpenAI's own endpoint,
# which would send the DeepSeek key to the wrong service. Default to the
# DeepSeek endpoint instead, mirroring the default already used for `model`.
base_url = api_config.get('deepseek_api_url') or DEFAULT_DEEPSEEK_API_URL

# Create the OpenAI-compatible client used to reach the DeepSeek API
client = OpenAI(
    api_key=api_key,
    base_url=base_url,
)

In [7]:
# Quick check: send one hand-written prompt through the configured client
sentence = "I ate breakfast"
prompt = (
    "Convert the following sentence to passive voice: "
    f"'{sentence}'"
)
run_chatgpt(prompt, client, model)

'The passive voice version of the sentence "I ate breakfast" is:  \n\n**"Breakfast was eaten by me."**  \n\nAlternatively, if the doer (in this case, "me") is not important, you could simply say:  \n**"Breakfast was eaten."**  \n\nLet me know if you\'d like further clarification!'

## 创建训练用样本并保存为 JSON

In [8]:
# Load the raw instruction examples used as input for the conversion
import json

data_dir = "../../ch07/02_dataset-utilities/"
json_file = "instruction-examples.json"

# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which can fail on or corrupt non-ASCII text in the file.
with open(data_dir + json_file, "r", encoding="utf-8") as file:
    json_data = json.load(file)

print("Number of entries:", len(json_data))

Number of entries: 200


In [10]:
# Try the passive-voice prompt on the first five entries and print the results
for sample in json_data[:5]:
    source_text = sample["output"]
    prompt = (
        "Without adding any response or explanation, "
        f"convert the following text to passive voice: {source_text}"
    )

    print("\nInput:")
    print(f">> {source_text}")
    print("\nOutput:")
    # The request is issued here, after the input has already been printed
    converted = run_chatgpt(prompt, client, model)
    print(f">> {converted}")
    print("\n-------------------------")



Input:
>> The verb in the sentence is "sleeps."

Output:
>> The verb in the sentence is "slept."

-------------------------

Input:
>> The plural form of "goose" is "geese."

Output:
>> The plural form of "goose" is said to be "geese."

-------------------------

Input:
>> The three primary colors are red, blue, and yellow.

Output:
>> Red, blue, and yellow are considered the three primary colors.

-------------------------

Input:
>> They had finished the game.

Output:
>> The game had been finished by them.

-------------------------

Input:
>> The abbreviation for "Doctor of Philosophy" is Ph.D.

Output:
>> The abbreviation "Ph.D." is used for "Doctor of Philosophy."

-------------------------


In [14]:
# Trial run: store the model's reply under "output_2" for the first five entries
from tqdm import tqdm

preview_entries = json_data[:5]
for entry in tqdm(preview_entries, total=len(preview_entries)):
    text = entry["output"]
    prompt = (
        "Without adding any response or explanation, "
        f"convert the following text to passive voice: {text}"
    )
    # The slice holds the same dict objects as json_data, so this updates json_data too
    entry["output_2"] = run_chatgpt(prompt, client, model)

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:20<00:00,  4.11s/it]


In [15]:
# Display the first entry to check the "output_2" field written by the preceding loop
json_data[0]

{'instruction': 'Identify the verb in the following sentence: The cat sleeps on the couch.',
 'input': '',
 'output': 'The verb in the sentence is "sleeps."',
 'output_2': 'The verb in the sentence is "slept."'}

In [17]:
# Build the dataset: store a passive-voice rewrite under "output_2" for every entry.
# Entries that already have "output_2" are skipped, so if a request fails part-way
# through this long run, re-executing the cell resumes instead of repeating every
# API call. Reload json_data first to force a full regeneration.
for entry in tqdm(json_data, total=len(json_data)):
    if "output_2" in entry:
        continue
    text = entry["output"]
    prompt = (
        "Without adding any response or explanation, "
        f"convert the following text to passive voice: {text}"
    )
    entry["output_2"] = run_chatgpt(prompt, client, model)

100%|██████████| 200/200 [27:08<00:00,  8.14s/it]


In [19]:
# Save the augmented entries in the current working directory, using the
# original file name with ".json" replaced by "-modified.json"
new_json_file = json_file.replace(".json", "-modified.json")

with open(new_json_file, "w") as out_file:
    json.dump(json_data, out_file, indent=4)