### 说明

本实验测试微量数据 (训练测试都使用同样的 100 条数据) 的微调效果

### 数据预处理

In [1]:
import json
from collections import Counter
import random
from typing import List, Dict

import pandas as pd
from sklearn.metrics import classification_report
from xinference.client import Client
import concurrent.futures


#### 处理成需要的数据集并保存

以下可跳过

数据集具体信息见 ./RAG/rag_v0.ipynb

In [3]:
with open('../datasets/QA_source.json', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f]

In [6]:
# 筛选样本数大于50的类别
categories = {
    category for category, count in Counter(item['category'] for item in data).items()
    if count > 50
}
# 随机抽取6个不同的键作为接下来的标签
categories = random.sample(list(categories), 6)

# 随机指定每个类别的个数, sum = 100
cnts = [34, 21, 14, 10, 8, 13]

selected: Dict = {}
# 收集每个类别的所有问题
category_questions = {}
for item in data:
    if item['category'] in categories:
        category_questions.setdefault(item['category'], []).append(item['question'])

# 从每个类别中随机抽取指定数量的样本
result = []
for category, count in zip(categories, cnts):
    questions = random.sample(category_questions[category], count)
    result.extend({'question': q, 'category': category} for q in questions)

with open('datasets.json', 'w', encoding='utf-8') as f:
    json.dump(result, f, indent=4, ensure_ascii=False)

#### 读取数据

取出数据

In [2]:
with open('datasets.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
counter = Counter(item['category'] for item in data)

In [4]:
counter

Counter({'债权债务': 34, '建设工程': 21, '婚姻家庭': 14, '劳动纠纷': 13, '公司法': 10, '合同纠纷': 8})

In [5]:
counter.keys()

dict_keys(['债权债务', '建设工程', '婚姻家庭', '公司法', '合同纠纷', '劳动纠纷'])

### 测试

#### xinference 运行命令

```powershell
# 启动 xinference
xinference-local --host 0.0.0.0 --port 9997
# 列出所有 Xinference 支持的指定类型的模型：
xinference registrations -t LLM
# 列出所有在运行的模型：
xinference list
# 停止某个正在运行的模型：
xinference terminate --model-uid "DeepSeek-R1-Distill-Qwen-1.5B"
```

#### 实测

用提示词测试原生法律问题判断的能力

In [6]:
prompt = """你是一位法律咨询专家，能够根据用户提问直接判断用户的咨询类别。
咨询类别为: '债权债务': 0, '建设工程': 1, '婚姻家庭': 2, '公司法': 3, '合同纠纷': 4, '劳动纠纷': 5
输出要求:你只需要输出对应类别的数字
示例:
用户输入: {}
规范输出: 
"""

In [7]:
client = Client('http://localhost:9997')
# model = client.get_model(model_uid="DeepSeek-R1-Distill-Qwen-1.5B")
# 原始模型
model = client.get_model(model_uid="qwen2.5-1.5b-instruct")

#### 单线程

In [12]:
result = []
for q in data[:5]:
    completion = model.chat(
        messages=[{
            'role': 'user',
            'content': prompt.format(q['question'])
        }],
        generate_config={"temperature": 0.0, "max_tokens": 512}
    )

    answer = completion["choices"][0]["message"]["content"]
    # 有思考过程的需要做截断
    filtered = answer.split("</think>")[-1].strip()
    result.append(filtered)
    print(filtered)

2
4
0
4
0


#### 多线程

In [8]:
# 多线程处理函数
def process_question(q):
    completion = model.chat(
        messages=[{
            'role': 'user',
            'content': prompt.format(q['question'])
        }],
        generate_config={"temperature": 0.0, "max_tokens": 2048}
    )
    answer = completion["choices"][0]["message"]["content"]
    return answer


In [9]:
result = []
# 使用 ThreadPoolExecutor 来并发处理
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
    futures = [executor.submit(process_question, q) for q in data]
    for future in concurrent.futures.as_completed(futures):
        answer = future.result()
        filtered = answer.split("</think>")[-1].strip()
        result.append(filtered)
        print(filtered)

0
0
1
0
0
0
0
0
0
0
4
0
0
0
0
0
0
0
3
0
0
0
0
1
1
0
0
0
1
0
0
0
2
0
0
0
5
0
0
1
0
1
1
2
1
0
5
1
1
2
1
2
0
1
0
0
1
2
0
1
2
2
2
1
2
0
0
4
0
0
1
0
0
0
2
2
2
0
2
2
0
5
4
0
0
0
1
4
2
5
2
1
0
1
0
2
0
0
0
2


#### 获得结果

将参考标签转化为数值

In [10]:
label2idx = {'债权债务': 0, '建设工程': 1, '婚姻家庭': 2, '公司法': 3, '合同纠纷': 4, '劳动纠纷': 5}
references = [label2idx[i['category']] for i in data]

In [11]:
references

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5]

将预测标签转化为数值

In [12]:
# result = [int(i) for i in result]
int_result = []
for i in result:
    # 由于ds有思考过程，可能会出现输出的token过长导致被截断, 所以将产生过长答案的结果设置为 Bad answer 一类
    # 也可以在模型配置中将ds的推理部分关闭就不用做截断了
    try:
        int_result.append(int(i))
    except:
        print('Bad answer!')
        int_result.append(6)


In [13]:
int_result

[0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 5,
 0,
 0,
 1,
 0,
 1,
 1,
 2,
 1,
 0,
 5,
 1,
 1,
 2,
 1,
 2,
 0,
 1,
 0,
 0,
 1,
 2,
 0,
 1,
 2,
 2,
 2,
 1,
 2,
 0,
 0,
 4,
 0,
 0,
 1,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 2,
 2,
 0,
 5,
 4,
 0,
 0,
 0,
 1,
 4,
 2,
 5,
 2,
 1,
 0,
 1,
 0,
 2,
 0,
 0,
 0,
 2]

转换为dataframe形式查看当前数据

In [None]:
df = pd.DataFrame()

#### 评估原生结果

##### ds-distilled

In [18]:
target_names = ['债权债务', '建设工程', '婚姻家庭', '公司法', '合同纠纷', '劳动纠纷', 'Bad answer']

In [23]:
print(classification_report(
    y_true=references,
    y_pred=int_result,
    digits=3,
    target_names=target_names,
    zero_division=0
))

              precision    recall  f1-score   support

        债权债务      0.636     0.412     0.500        34
        建设工程      0.600     0.143     0.231        21
        婚姻家庭      0.364     0.286     0.320        14
         公司法      0.143     0.100     0.118        10
        合同纠纷      0.079     0.375     0.130         8
        劳动纠纷      0.200     0.231     0.214        13
  Bad answer      0.000     0.000     0.000         0

    accuracy                          0.280       100
   macro avg      0.289     0.221     0.216       100
weighted avg      0.440     0.280     0.313       100



##### qwen2.5-1.5b-instruct

In [14]:
target_names = ['债权债务', '建设工程', '婚姻家庭', '公司法', '合同纠纷', '劳动纠纷']

In [15]:
print(classification_report(
    y_true=references,
    y_pred=int_result,
    digits=3,
    target_names=target_names,
    zero_division=0
))

              precision    recall  f1-score   support

        债权债务      0.500     0.794     0.614        34
        建设工程      0.421     0.381     0.400        21
        婚姻家庭      0.278     0.357     0.312        14
         公司法      0.000     0.000     0.000        10
        合同纠纷      0.250     0.125     0.167         8
        劳动纠纷      0.250     0.077     0.118        13

    accuracy                          0.420       100
   macro avg      0.283     0.289     0.268       100
weighted avg      0.350     0.420     0.365       100



### 微调

#### 指令数据集准备

In [None]:
instruct_data = []

for idx, i in enumerate(data):
    instruction = prompt.format(i['question'])
    input_str = ''
    output_str = str(references[idx])
    instruct_data.append({
        'instruction': instruction,
        'input': input_str,
        'output': output_str
    })

In [None]:
len(instruct_data)

In [None]:
with open('instruction_dataset.json', 'w', encoding='utf-8') as f:
    json.dump(instruct_data, f, indent=4, ensure_ascii=False)

当前保存在 `/root/LLaMA-Factory/data/legal_QA.json`

修改 `/root/LLaMA-Factory/data/dataset_info.json` 文件: 添加一个指令数据集的描述，如下
```json
  "legal_QA": {
    "file_name": "legal_QA.json"
  },
````
键将作为后续配置文件的数据集名称使用

#### 微调参数文件准备

当前保存在 `/root/LLaMA-Factory/examples/train_lora/llama3_lora_sft_qwen25.yaml`

```yaml
### model
model_name_or_path: /root/qwen2.5-1.5b-instruct
trust_remote_code: true

### method
stage: sft
do_train: true
finetuning_type: lora
lora_rank: 8
lora_target: all
deepspeed: examples/deepspeed/ds_z0_config.json  # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]

### dataset
# dataset: identity,alpaca_en_demo
dataset: legal_QA
template: llama3
cutoff_len: 2048
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
dataloader_num_workers: 4

### output
output_dir: /root/save/qwen2.5-1.5b-instruct/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
save_only_model: false
report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 2
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
resume_from_checkpoint: null
```

#### 训练命令

FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_qwen25.yaml

训练完成的权重保存在 `/root/save/qwen2.5-1.5b-instruct/lora/sft`

#### 合并模型和适配器

配置文件: `LLaMA-Factory/examples/merge_lora/qwen2_5instruct_lora_sft.yaml`

```yaml
### Note: DO NOT use quantized model or quantization_bit when merging lora adapters

### model
model_name_or_path: /root/qwen2.5-1.5b-instruct
adapter_name_or_path: /root/save/qwen2.5-1.5b-instruct/lora/sft
template: qwen
trust_remote_code: true

### export
export_dir: /root/save/qwen2.5-1.5b-instruct_legal_lora_sft
export_size: 2
export_device: cpu  # choices: [cpu, auto]
export_legacy_format: false
```

运行命令

llamafactory-cli export examples/merge_lora/qwen2_5instruct_lora_sft.yaml

#### 启动模型

##### llamafactory (未测试)

配置文件: `LLaMA-Factory/examples/inference/qwen2_5instruct_vllm.yaml`

运行命令

llamafactory-cli api examples/inference/qwen2_5instruct_vllm.yaml

##### xinference (本次使用)

如果需要单独的模型，则需要注册-配置-启动一条龙

#### 测试微调后的结果

In [16]:
client = Client('http://localhost:9997')
# 微调模型
sft_model = client.get_model(model_uid="qwen2.5-1.5b-instruct_legal_lora_sft")

In [18]:
def process_question(q):
    completion = sft_model.chat(
        messages=[{
            'role': 'user',
            'content': prompt.format(q['question'])
        }],
        generate_config={"temperature": 0.0, "max_tokens": 2048}
    )
    answer = completion["choices"][0]["message"]["content"]
    return answer


In [19]:
result = []
# 使用 ThreadPoolExecutor 来并发处理
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
    futures = [executor.submit(process_question, q) for q in data]
    for future in concurrent.futures.as_completed(futures):
        answer = future.result()
        filtered = answer.split("</think>")[-1].strip()
        result.append(filtered)
        print(filtered)

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
1
0
0
5
0
1
1
0
0
0
0
1
0
1
1
1
1
2
2
0
5
2
2
2
1
1
1
1
1
0
2
2
2
3
3
3
0
2
0
2
0
0
5
4
2
2
4
2
5
1
0
1
2
4
5
2
5
5
5
5
5
5
5
5
0
5
0
5


In [20]:
# result = [int(i) for i in result]
int_result = []
for i in result:
    # 由于ds有思考过程，可能会出现输出的token过长导致被截断, 所以将产生过长答案的结果设置为 Bad answer 一类
    # 也可以在模型配置中将ds的推理部分关闭就不用做截断了
    try:
        int_result.append(int(i))
    except:
        print('Bad answer!')
        int_result.append(6)


In [21]:
target_names = ['债权债务', '建设工程', '婚姻家庭', '公司法', '合同纠纷', '劳动纠纷']

In [22]:
print(classification_report(
    y_true=references,
    y_pred=int_result,
    digits=3,
    target_names=target_names,
    zero_division=0
))

              precision    recall  f1-score   support

        债权债务      0.660     0.912     0.765        34
        建设工程      0.412     0.333     0.368        21
        婚姻家庭      0.267     0.286     0.276        14
         公司法      0.000     0.000     0.000        10
        合同纠纷      0.333     0.125     0.182         8
        劳动纠纷      0.667     0.769     0.714        13

    accuracy                          0.530       100
   macro avg      0.390     0.404     0.384       100
weighted avg      0.461     0.530     0.484       100



原始模型结果如下:
1. 每个指标都有较大的提升
2. 可以看出 accuracy 有近 10% 的提升
3. 公司法的类别还是没有分类能力

              precision    recall  f1-score   support

        债权债务      0.500     0.794     0.614        34
        建设工程      0.421     0.381     0.400        21
        婚姻家庭      0.278     0.357     0.312        14
         公司法      0.000     0.000     0.000        10
        合同纠纷      0.250     0.125     0.167         8
        劳动纠纷      0.250     0.077     0.118        13

    accuracy                          0.420       100
   macro avg      0.283     0.289     0.268       100
weighted avg      0.350     0.420     0.365       100