In [5]:
from datasets import load_dataset
from openai import OpenAI
from pathlib import Path
import pandas as pd
import httpx
import json
# 从Hugging Face加载数据集
dataset = load_dataset("Lojitha/sl_marraige_law_QA")
#print(dataset)

train_dataset = dataset['train']
#把数据集的前二十行作为训练集
train_df = pd.DataFrame(train_dataset[:5])
questions_answers = train_df[['Question', 'Answer']]

with open('finetune_data_chat_format.jsonl', 'w', encoding='utf-8') as jsonl_file:
    for index, example in questions_answers.iterrows():    
        formatted_data = {         
            "messages": [          
                {"role": "system", "content": "You are a professional legal expert assistant that provides accurate answers to lawrelated questions."},                 
                {"role": "user", "content": str(example['Question']).replace('-', '')},                
                {"role": "assistant", "content": str(example['Answer']).replace('-', '')}            
            ]        
        }
        
        try:
            # 确保JSON编码时不会添加额外的换行符
            json_line = json.dumps(formatted_data, ensure_ascii=False, separators=(',', ':'))
            jsonl_file.write(json_line + '\n')
        except Exception as e:
            print(f"处理第 {index} 行时出错: {str(e)}")
            continue

# 验证生成的文件
print("验证JSONL文件内容:")
with open('finetune_data_chat_format.jsonl', 'r', encoding='utf-8') as f:
    content = f.read()
    print("文件内容预览(前500字符):")
    print(content[:500])
    print("\n检查每行的JSON格式:")
    for i, line in enumerate(content.splitlines()):
        try:
            parsed = json.loads(line)
            print(f"第 {i+1} 行格式正确")
            print(f"用户内容长度: {len(parsed['messages'][1]['content'])} 字符")
        except json.JSONDecodeError as e:
            print(f"第 {i+1} 行格式错误: {str(e)}")

# 使用系统代理设
client = OpenAI(
    api_key="sk-EsEwsfwXcrFXcBPpXBut724Q48acqfiaTG0OfVpIJUaFTsVz",
    base_url="http://api.wlai.vip/v1",
    http_client=httpx.Client(
        verify=False,
        timeout=30.0,  # 增加超时时间到30秒
        transport=httpx.HTTPTransport(retries=3)
    )
)
#添加异常处理
try:
    # 测试API连接
    test_response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[{"role": "user", "content": "Hello"}]
    )
    print("API连接测试成功")
    
    # 文件上传
    with open('finetune_data_chat_format.jsonl', 'rb') as file:
        try:
            # 确保文件对象正确定位到开始位置
            file.seek(0)
            response = client.files.create(
                file=file,
                purpose='fine-tune'
            )
            print("原始响应:", response)
            file_id = response.id if hasattr(response, 'id') else None
            
            if not file_id:
                raise ValueError("无法获取文件ID")
            print(f"获取到的文件ID: {file_id}")
            # 创建微调任务
            fine_tune_response = client.fine_tuning.jobs.create(
                training_file=file_id,
                model="gpt-3.5-turbo-1106"
            )
            print("微调作业已启动,ID为:", fine_tune_response.id)
            
        except Exception as e:
            print(f"文件上传错误: {str(e)}")
            # 添加更详细的错误信息打印
            if hasattr(e, 'response'):
                print(f"错误响应: {e.response.text if hasattr(e.response, 'text') else e.response}")
            raise
            
except Exception as e:
    print(f"发生错误: {str(e)}")
    print(f"错误类型: {type(e)}")

验证JSONL文件内容:
文件内容预览(前500字符):
{"messages":[{"role":"system","content":"You are a professional legal expert assistant that provides accurate answers to lawrelated questions."},{"role":"user","content":"Can I register a place of worship for marriage ceremonies?"},{"role":"assistant","content":"Yes, the minister, proprietor, or trustee of a building used for public Christian worship can apply for its registration to solemnize marriages therein. The application must be accompanied by a declaration signed by at least twenty house

检查每行的JSON格式:
第 1 行格式正确
用户内容长度: 58 字符
第 2 行格式正确
用户内容长度: 90 字符
第 3 行格式正确
用户内容长度: 40 字符
第 4 行格式正确
用户内容长度: 64 字符
第 5 行格式正确
用户内容长度: 54 字符
API连接测试成功
原始响应: <!doctype html>
<html lang="en">

<head>
  <meta charset="utf-8" />
  <link rel="icon" href="/logo.png" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <meta name="theme-color" content="#ffffff" />
  <meta name="description"
    content="中转API提供便宜低价的ChatGPT中转API服务。以近乎GPT-3.5的价格使用GPT-4,支持gp