### 一、读取 TSV 数据，转化成 Alpaca 格式（JSON）的 LLM 训练集

In [None]:
import csv
import json

import pandas as pd

In [None]:
# Load the MRPC training split (tab-separated).
# quoting=csv.QUOTE_NONE: sentences in this corpus can contain literal double
# quotes. With pandas' default quote handling such rows may be merged or seen
# as malformed, and on_bad_lines='skip' would then drop them silently.
df = pd.read_csv(
    '../data/msr_paraphrase_train.tsv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    on_bad_lines='skip',
)

In [None]:
# Show the last 5 rows of the data
df.tail()

In [None]:
# Build the task instruction that is attached to every sample
instruction="判断两个句子在语义上是否等同。如果等价，则输出 “1”；如果不等价，则输出 “0”。"

In [None]:
# Turn every DataFrame row into one Alpaca-style record with the keys
# "instruction", "input" and "output".
alpaca_data = [
    {
        "instruction": instruction,
        "input": f"句子 1: {sample['#1 String']}\n句子 2: {sample['#2 String']}",
        "output": str(sample['Quality']),
    }
    for _, sample in df.iterrows()
]

# Serialise the records to a JSON string indented by 4 spaces; non-ASCII
# characters are kept verbatim instead of being \u-escaped.
alpaca_json = json.dumps(alpaca_data, indent=4, ensure_ascii=False)

In [None]:
# Inspect the last converted record
alpaca_data[-1]

In [None]:
# Save the JSON string to a file.
# The encoding is given explicitly because alpaca_json contains raw non-ASCII
# text (it was dumped with ensure_ascii=False); the platform's default
# encoding may not be able to represent it.
with open('./MRPC_train_data.json', 'w', encoding='utf-8') as f:
    f.write(alpaca_json)

### 二、LLaMA-Factory 导入数据集

> `LLaMA-Factory/data/dataset_info.json`
* 将 /output/MRPC_train_data.json 文件复制到 LLaMA-Factory 的 data 目录下
> 修改 `LLaMA-Factory/data/dataset_info.json`
```json
{
  "MRPC_train_data":{
     "file_name": "MRPC_train_data.json"
  }, 
  "identity": {
    "file_name": "identity.json"
  ......
```
将 MRPC_train_data.json 的记录添加到配置信息中

<img src="../res/1.png" alt="Alt Text" width="600" height="400">

### 三、启动llama_factory可视化界面

```
conda activate /data/lilk/yuexiang/venv

cd /data/lilk/yuexiang/LLaMA-Factory

python src/webui.py
```

### 四、启动llama_factory进行微调

**方式1** 通过平台提供的 LLaMA-Factory 界面训练

* 进入界面后，选择模型，填写模型路径，选择训练集
  <img src="../res/3.jpg" alt="Alt Text" width="800" height="400">

* 根据您的微调经验，调整相关参数，执行训练

   <img src="../res/4.jpg" alt="Alt Text" width="800" height="400">

* 注意, 您需要将ui中生成的脚本复制到output/目录下的train.sh
* 在网页中点击“预览命令”即可出现所对应的命令

**方式2** 直接在命令行中执行
* 新建 jupyter 终端 
* 进入 `/root/dg/LLaMA-Factory` 目录
* 命令行下启动微调，例如：
```bash
llamafactory-cli train \
    --stage sft \
    --do_train True \
    --model_name_or_path /home/public/data/Model/Qwen1.5-1.8B-Chat \
    --preprocessing_num_workers 16 \
    --finetuning_type lora \
    --template qwen \
    --flash_attn auto \
    --dataset_dir data \
    --dataset MRPC_train_data \
    --cutoff_len 1024 \
    --learning_rate 5e-05 \
    --num_train_epochs 3.0 \
    --max_samples 100000 \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps 8 \
    --lr_scheduler_type cosine \
    --max_grad_norm 1.0 \
    --logging_steps 5 \
    --save_steps 100 \
    --warmup_steps 0 \
    --packing False \
    --report_to none \
    --output_dir saves/Qwen1.5-1.8B-Chat/lora/test_train1 \
    --bf16 True \
    --plot_loss True \
    --ddp_timeout 180000000 \
    --optim adamw_torch \
    --lora_rank 8 \
    --lora_alpha 16 \
    --lora_dropout 0 \
    --lora_target all
```

### 五、transformers推理

In [None]:
# 导包
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import PeftModel,LoraConfig,TaskType

In [None]:
# Path of the base model passed to from_pretrained
model_path = '/data/lilk/yuexiang/Qwen1.5-1.8B-Chat'
# Path of the LoRA adapter (presumably the output directory of a LLaMA-Factory
# training run — verify it points at the run you want to evaluate)
lora_path = '/data/lilk/yuexiang/LLaMA-Factory/saves/Qwen1.5-1.8B-Chat/lora/train_2024-11-06-17-19-45'

In [None]:
# LoRA adapter configuration handed to PeftModel.from_pretrained.
# NOTE(review): these values should match the ones the adapter was trained
# with — confirm against the adapter_config.json stored in lora_path.
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=True,  # the adapter is only used for inference in this notebook
    r=8,  # LoRA rank
    lora_alpha=16,  # LoRA alpha (scaling factor)
    lora_dropout=0.0,  # dropout ratio
)

In [None]:
# Load the tokenizer from the base-model directory
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load the base model in bfloat16 with device_map="cuda"
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
)
# Wrap the base model with the LoRA weights stored in lora_path
model = PeftModel.from_pretrained(
    model,
    model_id=lora_path,
    config=config,
)

In [None]:
# Look up the device of the model's first parameter and print it
device = next(model.parameters()).device
print("模型所在的device:", device)

### 六、读取测试集推理

In [None]:
# Load the MRPC test split (tab-separated).
# quoting=csv.QUOTE_NONE: sentences in this corpus can contain literal double
# quotes. With pandas' default quote handling such rows may be merged or seen
# as malformed, and on_bad_lines='skip' would then drop them silently.
df = pd.read_csv(
    '/data/lilk/yuexiang/data/msr_paraphrase_test.tsv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    on_bad_lines='skip',
)
# Show the last 5 rows
df.tail()

In [None]:
from tqdm import tqdm


def get_llm_response(row, max_new_tokens=512):
    """Generate the model's answer for one sentence pair.

    Args:
        row: Record that supports ``row['#1 String']`` and
            ``row['#2 String']`` (e.g. a DataFrame row).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded completion, with the prompt tokens and special tokens
        removed.
    """
    # NOTE(review): the training records carry this text in the Alpaca
    # "instruction" field. Confirm that the training framework renders it the
    # same way as a system message here; otherwise the prompt seen at
    # inference time differs from the one seen during fine-tuning.
    messages = [
        {"role": "system", "content": instruction},
        {"role": "user", "content":  f"句子 1: {row['#1 String']}\n句子 2: {row['#2 String']}"}
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Move the inputs to the device the model actually lives on instead of a
    # hard-coded 'cuda'.
    target_device = next(model.parameters()).device
    model_inputs = tokenizer([text], return_tensors="pt").to(target_device)

    # Pass the whole encoding so that attention_mask reaches generate() along
    # with input_ids.
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens,
    )

    # generate() returns prompt + completion; keep only the completion part.
    prompt_length = model_inputs.input_ids.shape[1]
    completion_ids = generated_ids[:, prompt_length:]
    return tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]


# One {'ID', 'preQuality'} record per test row.
response_records = []

for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="获取 LLM 回答并更新预测"):
    # The sample id is read directly to avoid shadowing the builtin `id`.
    response_records.append({'ID': row['#2 ID'], 'preQuality': get_llm_response(row)})

In [None]:
# Path of the file the prediction records are written to
output_file = './test_data_llm_predictions.csv'

In [None]:
# Collect the prediction records into a DataFrame and save them as CSV
# without the index column
updated_df = pd.DataFrame(response_records)
updated_df.to_csv(output_file, index=False)