In [1]:
# make sure to use the latest version of the openai python package
!pip install --upgrade openai 



In [2]:
import json
import openai
import os
import pandas as pd
from pprint import pprint

# Take the API key from the environment only. A literal fallback key would be
# saved with the notebook and leak to anyone who can read the file.
# The value is None when the OPENAI_API_KEY variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [3]:
# Read in the dataset we'll use for this task.
# The CSV holds question/answer pairs in the `Question` and `Normal_Response`
# columns. Its text encoding is not known up front, so each candidate encoding
# is tried in order and the first one that decodes the file is kept.
encodings = ['utf-8', 'utf-8-sig', 'cp950', 'big5', 'gbk', 'gb2312']
for enc in encodings:
    try:
        recipe_df = pd.read_csv("heal_recipes_nlg_solo_50.csv", encoding=enc)
    except UnicodeDecodeError:
        print(f"Failed with encoding: {enc}")
    else:
        print(f"File successfully read with encoding: {enc}")
        break
else:
    # No candidate worked: fail here with a clear message instead of letting
    # the next line raise a NameError on the unbound `recipe_df`.
    raise RuntimeError(
        f"Could not decode heal_recipes_nlg_solo_50.csv with any of: {encodings}"
    )

recipe_df.head()


Failed with encoding: utf-8
Failed with encoding: utf-8-sig
File successfully read with encoding: cp950


Unnamed: 0,Question,Normal_Response
0,什麼是憂鬱症？,憂鬱症是一種常見且嚴重的醫學疾病，它會對您的感受、思考方式和行為產生負面影響。幸運的是，它是...
1,我覺得我讓我的父母失望了,無論你覺得自己如何，你的父母永遠愛你，且他們會理解並支持你。
2,有哪些類型的憂鬱症？,有多種不同形式的憂鬱症，包括臨床憂鬱症、持續性憂鬱症、躁鬱症和產後憂鬱症。
3,我一無是處。,你的價值遠遠超過你目前的感覺。每個人都有自己獨特的才能和價值。
4,什麼是壓力？,壓力是一個身體的反應，當面臨挑戰或危險時，身體會產生一系列的生物反應。這些反應有助於我們面對...


In [4]:
# System prompt placed at the start of every example conversation.
# (Chinese for: "You are an assistant; your task is to give the correct
# response to the question.")
system_message: str = "您是助理，您的任務是給出問題的正確回應。"

# Starts out empty; the training examples are assigned to this name later.
training_data: list = []

# Build the user-turn text from one row of the dataset.
def create_user_message(row):
    """Return the row's `Question` value prefixed with "Question: "."""
    question = row["Question"]
    return f"Question: {question}"

# Turn one dataset row into a single example conversation.
def prepare_example_conversation(row):
    """Build the system / user / assistant example for `row`.

    Returns a dict with one key, "messages", holding three role/content
    dicts in that order. The assistant content is the row's
    `Normal_Response` value, i.e. the reference answer.
    """
    system_turn = {"role": "system", "content": system_message}
    user_turn = {"role": "user", "content": create_user_message(row)}
    assistant_turn = {"role": "assistant", "content": row["Normal_Response"]}
    return {"messages": [system_turn, user_turn, assistant_turn]}

# Sanity check: show the conversation built from the first row of the dataset.
first_row = recipe_df.iloc[0]
pprint(prepare_example_conversation(first_row))


{'messages': [{'content': '您是助理，您的任務是給出問題的正確回應。', 'role': 'system'},
              {'content': 'Question: 什麼是憂鬱症？', 'role': 'user'},
              {'content': '憂鬱症是一種常見且嚴重的醫學疾病，它會對您的感受、思考方式和行為產生負面影響。幸運的是，它是可以治療的。憂鬱症會導致悲傷的感覺和/或對您曾經喜愛的活動失去興趣。它可能導致各種情感和身體上的問題，並可能減少您在工作和家中的功能。',
               'role': 'assistant'}]}


In [5]:
# Training split: the rows labelled 0 through 85 (`.loc` slices include both
# endpoints).
training_df = recipe_df.loc[0:85]

# Convert every row of the training split into one example conversation.
training_data = (
    training_df
    .apply(prepare_example_conversation, axis=1)
    .tolist()
)

# Show the first five examples; pprint keeps the nested output readable.
for example in training_data[:5]:
    pprint(example)

{'messages': [{'content': '您是助理，您的任務是給出問題的正確回應。', 'role': 'system'},
              {'content': 'Question: 什麼是憂鬱症？', 'role': 'user'},
              {'content': '憂鬱症是一種常見且嚴重的醫學疾病，它會對您的感受、思考方式和行為產生負面影響。幸運的是，它是可以治療的。憂鬱症會導致悲傷的感覺和/或對您曾經喜愛的活動失去興趣。它可能導致各種情感和身體上的問題，並可能減少您在工作和家中的功能。',
               'role': 'assistant'}]}
{'messages': [{'content': '您是助理，您的任務是給出問題的正確回應。', 'role': 'system'},
              {'content': 'Question: 我覺得我讓我的父母失望了', 'role': 'user'},
              {'content': '無論你覺得自己如何，你的父母永遠愛你，且他們會理解並支持你。',
               'role': 'assistant'}]}
{'messages': [{'content': '您是助理，您的任務是給出問題的正確回應。', 'role': 'system'},
              {'content': 'Question: 有哪些類型的憂鬱症？', 'role': 'user'},
              {'content': '有多種不同形式的憂鬱症，包括臨床憂鬱症、持續性憂鬱症、躁鬱症和產後憂鬱症。',
               'role': 'assistant'}]}
{'messages': [{'content': '您是助理，您的任務是給出問題的正確回應。', 'role': 'system'},
              {'content': 'Question: 我一無是處。', 'role': 'user'},
              {'content': '你的價值遠遠超過你目前的感覺。每個人都有自己獨特的才能和價值。',
               '

In [6]:
# Validation split: the rows labelled 86 through 100 (`.loc` slices include
# both endpoints), converted to example conversations the same way.
validation_df = recipe_df.loc[86:100]
validation_data = (
    validation_df
    .apply(prepare_example_conversation, axis=1)
    .tolist()
)

In [7]:
def write_jsonl(data_list: list, filename: str) -> None:
    """Write each item of `data_list` to `filename` as one JSON document per line.

    The file is opened in "w" mode, so any existing content is replaced.
    """
    with open(filename, "w") as out:
        out.writelines(json.dumps(record) + "\n" for record in data_list)

In [8]:
# Output paths for the two JSONL files.
training_file_name = "heal_recipe_finetune_training.jsonl"
validation_file_name = "heal_recipe_finetune_validation.jsonl"

# Write the training examples first, then the validation examples.
write_jsonl(training_data, training_file_name)
write_jsonl(validation_data, validation_file_name)

In [9]:
import json

# Preview up to the first five records of the training file.
# Iterating over the file stops at end-of-file and blank lines are skipped, so
# a file with fewer than five records no longer passes "" to json.loads.
with open(training_file_name, 'r', encoding='utf-8') as file:
    lines = [line.strip() for _, line in zip(range(5), file)]

for line in lines:
    if not line:
        continue
    data = json.loads(line)
    # ensure_ascii=False prints the Chinese text itself rather than \uXXXX escapes.
    print(json.dumps(data, ensure_ascii=False, indent=2))


{
  "messages": [
    {
      "role": "system",
      "content": "您是助理，您的任務是給出問題的正確回應。"
    },
    {
      "role": "user",
      "content": "Question: 什麼是憂鬱症？"
    },
    {
      "role": "assistant",
      "content": "憂鬱症是一種常見且嚴重的醫學疾病，它會對您的感受、思考方式和行為產生負面影響。幸運的是，它是可以治療的。憂鬱症會導致悲傷的感覺和/或對您曾經喜愛的活動失去興趣。它可能導致各種情感和身體上的問題，並可能減少您在工作和家中的功能。"
    }
  ]
}
{
  "messages": [
    {
      "role": "system",
      "content": "您是助理，您的任務是給出問題的正確回應。"
    },
    {
      "role": "user",
      "content": "Question: 我覺得我讓我的父母失望了"
    },
    {
      "role": "assistant",
      "content": "無論你覺得自己如何，你的父母永遠愛你，且他們會理解並支持你。"
    }
  ]
}
{
  "messages": [
    {
      "role": "system",
      "content": "您是助理，您的任務是給出問題的正確回應。"
    },
    {
      "role": "user",
      "content": "Question: 有哪些類型的憂鬱症？"
    },
    {
      "role": "assistant",
      "content": "有多種不同形式的憂鬱症，包括臨床憂鬱症、持續性憂鬱症、躁鬱症和產後憂鬱症。"
    }
  ]
}
{
  "messages": [
    {
      "role": "system",
      "content": "您是助理，您的任務是給出問題的正確回應。"
    },
    {
      "role": "

In [10]:
# Authenticate with the key stored in OPENAI_API_KEY instead of pasting a
# secret into the cell.
openai.api_key = OPENAI_API_KEY

# Upload the training file for fine-tuning. The `with` block closes the file
# handle once the upload call returns.
with open(training_file_name, "rb") as training_file:
    training_response = openai.File.create(file=training_file, purpose="fine-tune")
training_file_id = training_response["id"]

# Same for the validation file.
with open(validation_file_name, "rb") as validation_file:
    validation_response = openai.File.create(file=validation_file, purpose="fine-tune")
validation_file_id = validation_response["id"]

print("Training file ID:", training_file_id)
print("Validation file ID:", validation_file_id)

Training file ID: file-f4WTtRcZVf0zLmxGcdajHuYK
Validation file ID: file-m5UGdB3tOgQZbKH7coSbiZnE


In [11]:
# Start a fine-tuning job on gpt-3.5-turbo using the two uploaded files.
response = openai.FineTuningJob.create(
    model="gpt-3.5-turbo",
    suffix="recipe-ner",
    training_file=training_file_id,
    validation_file=validation_file_id,
)

# Keep the job id so later cells can look the job up.
job_id = response["id"]

print("Job ID:", job_id)
print("Status:", response["status"])

Job ID: ftjob-dmpVKo3C6m3kHDWaXn9NyHbY
Status: validating_files


In [12]:
# Fetch the current state of the job and report its id, status and
# trained-token count, one field per line.
response = openai.FineTuningJob.retrieve(job_id)

for label, field in (
    ("Job ID:", "id"),
    ("Status:", "status"),
    ("Trained Tokens:", "trained_tokens"),
):
    print(label, response[field])

Job ID: ftjob-dmpVKo3C6m3kHDWaXn9NyHbY
Status: validating_files
Trained Tokens: None


In [63]:
# Fetch up to 50 events recorded for the job.
response = openai.FineTuningJob.list_events(id=job_id, limit=50)

# Reverse the event list in place before printing.
# NOTE(review): presumably the API returns newest-first, so this yields
# chronological order; confirm against the API reference.
events = response["data"]
events.reverse()

# Print only the message of each event, one per line.
for message in (event["message"] for event in events):
    print(message)

Created fine-tuning job: ftjob-FZHHuKOw35GOdaB6t4X11ODJ
Validating training file: file-qdsadMgZjOeMRE45jwNM7G4e and validation file: file-EmxTOADTbW7ASIZWGY5fXism
Files validated, moving job to queued state
Fine-tuning job started


In [64]:
# Look the job up again and read the id of the resulting fine-tuned model.
response = openai.FineTuningJob.retrieve(job_id)
fine_tuned_model_id = response["fine_tuned_model"]

# A missing model id is treated as "job not finished" and stops the notebook here.
if fine_tuned_model_id is not None:
    print("Fine-tuned model ID:", fine_tuned_model_id)
else:
    raise RuntimeError("Fine-tuned model ID not found. Your job has likely not been completed yet.")

RuntimeError: Fine-tuned model ID not found. Your job has likely not been completed yet.

In [53]:
# Authenticate with the key stored in OPENAI_API_KEY instead of pasting a
# secret into the cell.
openai.api_key = OPENAI_API_KEY

# Fetch the fine-tuning jobs returned by the list endpoint.
active_jobs = openai.FineTuningJob.list()

# Statuses of jobs that have not finished yet. Fine-tuning jobs never report
# 'pending', so the original filter matched nothing.
UNFINISHED_STATUSES = ("validating_files", "queued", "running")

# Cancel every unfinished job. `cancel` is the operation that stops a job,
# which is what this cell is meant to do.
for job in active_jobs.data:
    if job.status in UNFINISHED_STATUSES:
        openai.FineTuningJob.cancel(job.id)