# 一、读取把csv训练集转换成Alpaca json格式的llm训练集

In [1]:
import pandas as pd
import json

In [2]:
df = pd.read_csv('./data/msr_paraphrase_train.tsv',sep='\t',on_bad_lines='skip')

In [3]:
df.tail()

Unnamed: 0,Quality,#1 ID,#2 ID,#1 String,#2 String
3933,1,1620264,1620507,"At this point , Mr. Brando announced : ' Some...","Brando said that "" somebody ought to put a bul..."
3934,0,1848001,1848224,"Martin , 58 , will be freed today after servin...",Martin served two thirds of a five-year senten...
3935,1,747160,747144,We have concluded that the outlook for price ...,"In a statement , the ECB said the outlook for ..."
3936,1,2539933,2539850,The notification was first reported Friday by ...,MSNBC.com first reported the CIA request on Fr...
3937,0,453575,453448,The 30-year bond US30YT = RR rose 22 / 32 for ...,The 30-year bond US30YT = RR grew 1-3 / 32 for...


In [4]:
# 构建一个指令
instruction="判断两个句子在语义上是否等同。如果等价，则输出 “1”；如果不等价，则输出 “0”。"

In [5]:
# 创建一个空列表来存储转换后的数据
alpaca_data = []
for index, row in df.iterrows():
    # 创建一个字典来存储当前行的数据
    data_point = {
        "instruction": instruction,
        "input": f"句子 1: {row['#1 String']}\n句子 2: {row['#2 String']}",
        "output": str(row['Quality'])
    }
    # 将字典添加到列表中
    alpaca_data.append(data_point)

# 将列表转换为JSON字符串
alpaca_json = json.dumps(alpaca_data, indent=4)

In [7]:
alpaca_data[-1]

{'instruction': '判断两个句子在语义上是否等同。如果等价，则输出 “1”；如果不等价，则输出 “0”。',
 'input': "句子 1: The 30-year bond US30YT = RR rose 22 / 32 for a yield of 4.31 percent , versus 4.35 percent at Wednesday 's close .\n句子 2: The 30-year bond US30YT = RR grew 1-3 / 32 for a yield of 4.30 percent , down from 4.35 percent late Wednesday .",
 'output': '0'}

In [8]:
#保存到文件
with open('./output/MRPC_train_data.json', 'w') as f:
    f.write(alpaca_json)

# 二 编辑llamafactory的dataset.info  
> `LLama-factory/data/dataset_info.json`
* 将/output/MRPC_train_data.json文件复制到llamafactory的data目录下
> 修改 LLaMaFactory/data/dataset_info.json
```json
{
  "MRPC_train_data":{
     "file_name": "MRPC_train_data.json"
  }, 
  "identity": {
    "file_name": "identity.json"
  ......
```
将llm_train_data.json 记录添加到配置信息中

<img src="res/1.png" alt="Alt Text" width="600" height="400">

# 三 使用Llamafacotry 微调
** 方式1 ** 通过平台提供的 Llamafactory 界面训练

<img src="res/2.jpg" alt="Alt Text" width="600" height="400">

* 进入界面后，选择模型，填写模型路径，选择训练集
  <img src="res/3.jpg" alt="Alt Text" width="800" height="400">

* 根据您的微调经验，调整相关参数，执行训练

   <img src="res/4.jpg" alt="Alt Text" width="800" height="400">

* 注意, 您需要将ui中生成的脚本复制到output/目录下的train.sh

** 方式2 ** 直接在命令行中执行
* 新建 jupyter 终端 
* 进入 `/root/dg/LLaMA-Factory` 目录
* 命令行下启动微调，例如：
```bash
llamafactory-cli train \
    --stage sft \
    --do_train True \
    --model_name_or_path /home/public/data/Model/Qwen1.5-1.8B-Chat \
    --preprocessing_num_workers 16 \
    --finetuning_type lora \
    --template qwen \
    --flash_attn auto \
    --dataset_dir data \
    --dataset MRPC_train_data \
    --cutoff_len 1024 \
    --learning_rate 5e-05 \
    --num_train_epochs 3.0 \
    --max_samples 100000 \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps 8 \
    --lr_scheduler_type cosine \
    --max_grad_norm 1.0 \
    --logging_steps 5 \
    --save_steps 100 \
    --warmup_steps 0 \
    --packing False \
    --report_to none \
    --output_dir saves/Qwen1.5-1.8B-Chat/lora/test_train1 \
    --bf16 True \
    --plot_loss True \
    --ddp_timeout 180000000 \
    --optim adamw_torch \
    --lora_rank 8 \
    --lora_alpha 16 \
    --lora_dropout 0 \
    --lora_target all
```

# 四 使用transformers拉起模型
> 需要注意这里我们使用国产算力卡 华为的ascend910b，所以需要引入 torch_npu 库

In [17]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import torch_npu
from peft import PeftModel,LoraConfig,TaskType

device=torch.npu.is_available()

model_path = '/home/public/data/Model/Qwen1.5-1.8B-Chat'
lora_path = '/root/dg/LLaMA-Factory/saves/Qwen1.5-1.8B-Chat/lora/test_train1'

In [18]:
# 这里的参数要和训练界面或命令下的参数一致
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False, # 训练模式
    r=8, # Lora 秩
    lora_alpha=16, # Lora alaph，具体作用参见 Lora 原理
    lora_dropout=0.# Dropout 比例
)

In [19]:
# 加载tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# 加载模型 这里我们使用的华为ascend 910B,所device_map="npu"
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="npu",torch_dtype=torch.bfloat16)
# 加载lora权重
model = PeftModel.from_pretrained(model, model_id=lora_path, config=config)

In [20]:
device = next(model.parameters()).device
print("模型所在的device:", device)

模型所在的device: npu:0


# 读取测试集推理

In [21]:
df = pd.read_csv('./data/msr_paraphrase_test.tsv',sep='\t',on_bad_lines='skip')
df.tail()

Unnamed: 0,Quality,#1 ID,#2 ID,#1 String,#2 String
1634,0,2685984,2686122,"After Hughes refused to rehire Hernandez , he ...",Hernandez filed an Equal Employment Opportunit...
1635,0,339215,339172,There are 103 Democrats in the Assembly and 47...,Democrats dominate the Assembly while Republic...
1636,0,2996850,2996734,Bethany Hamilton remained in stable condition ...,"Bethany , who remained in stable condition aft..."
1637,1,2095781,2095812,"Last week the power station ’ s US owners , AE...","The news comes after Drax 's American owner , ..."
1638,1,2136244,2136052,Sobig.F spreads when unsuspecting computer use...,The virus spreads when unsuspecting computer u...


In [22]:
from tqdm import tqdm

# 定义调用 LLM 模型的函数
def get_llm_response(row):
    messages = [
        {"role": "system", "content": instruction},
        {"role": "user", "content":  f"句子 1: {row['#1 String']}\n句子 2: {row['#2 String']}"}
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to('npu')
    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response

response_records = []

for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="获取 LLM 回答并更新预测"):
    id = row['#2 ID']
    response = get_llm_response(row)
    response_records.append({'ID': id, 'preQuality': response})


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  sorted_indices_to_remove[..., -self.min_tokens_to_keep :] = 0
获取 LLM 回答并更新预测: 100%|██████████| 1639/1639 [07:12<00:00,  3.79it/s]


In [24]:
# 将更新后的记录写入文件
output_file = './output/test_data_llm_predictions.csv'

In [25]:
updated_df = pd.DataFrame(response_records)

In [27]:
updated_df.to_csv(output_file, index=False)