In [57]:
from datasets import load_dataset


train_data_path = '/home/public/ldn/zpLLM/data/train_data.json'
eval_data_path = '/home/public/ldn/zpLLM/data/test_data.json'
# Load dataset from the hub
train_dataset = load_dataset("json", data_files=train_data_path)
eval_dataset = load_dataset("json", data_files=eval_data_path)

In [58]:
train_dataset,eval_dataset

(DatasetDict({
     train: Dataset({
         features: ['output', 'input', 'instruction'],
         num_rows: 7384
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['output', 'input', 'instruction'],
         num_rows: 1846
     })
 }))

In [59]:
from transformers import LlamaForCausalLM, LlamaTokenizer
base_model = '/home/ldn/.cache/huggingface/hub/models--lmsys--vicuna-7b-v1.3/snapshots/ac066c83424c4a7221aa10c0ebe074b24d3bcdb6'
tokenizer = LlamaTokenizer.from_pretrained(base_model)

In [60]:
tokenizer
tokenizer.pad_token_id = (
        0  # unk. we want this to be different from the eos token
    )
tokenizer.padding_side = "left"  # Allow batched inference

In [61]:
train_dataset

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 7384
    })
})

In [62]:
from functools import partial
# 格式化数据集格式
def create_prompt_formats(sample):
    INTRO_BLURB = "以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。"
    INSTRUCTION_KEY = "### Instruction:"
    INPUT_KEY = "Input:"
    RESPONSE_KEY = "### Response:"
    END_KEY = "### End"
    
    blurb = f"{INTRO_BLURB}"
    instruction = f"{INSTRUCTION_KEY}\n{sample['instruction']}"
    input_context = f"{INPUT_KEY}\n{sample['input']}" if sample["input"] else None
    response = f"{RESPONSE_KEY}\n{sample['output']}"
    end = f"{END_KEY}"
    
    parts = [part for part in [blurb, instruction, input_context, response, end] if part]
    # print(parts)
    formatted_prompt = "\n\n".join(parts)
    # print(formatted_prompt)
    sample["text"] = formatted_prompt
    return sample

# tokenize a batch
def preprocess_batch(batch,tokenizer,max_length):
    return tokenizer(
        batch['text'],
        max_length=max_length,
        truncation=True
    )

# format and tokenize 
def preprocess_dataset(tokenizer: LlamaTokenizer, max_length: int, seed, dataset: str):
    print('预处理数据集...')
    # 每个样本添加提示
    dataset = dataset.map(create_prompt_formats)
    print(dataset)
    
    # 去掉每个样本的无用的列
    _preprocessing_function = partial(preprocess_batch,max_length=max_length,tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched = True,
        remove_columns = ["instruction", "input", "output", "text"],
    )
    
    # 过滤掉 input_ids 超出 max_length的样本
    dataset = dataset.filter(lambda sample: len(sample['input_ids']) < max_length)
    
    # shuffle dataset
    dataset = dataset.shuffle(seed=seed)
    return dataset

In [63]:
max_length = 2048
seed = 42
train_dataset = preprocess_dataset(tokenizer,max_length,seed,train_dataset)

预处理数据集...
DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction', 'text'],
        num_rows: 7384
    })
})


Map:   0%|          | 0/7384 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7384 [00:00<?, ? examples/s]

In [64]:
train_dataset = train_dataset['train']

In [65]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 7384
})

In [66]:
tokenizer.decode(train_dataset[1]['input_ids'])

'<s>以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。\n\n### Instruction:\n判断一段话是否为诈骗话术，输出0或1，这段话为-->\n\nInput:\n啊喂吃了没，ah，咋没有说还没吃啊，找啥原因呢，那哈点个外卖吗没吃啊，完了那做了吗给你啊，没时间点名字快点点赶紧点点个面，我刚吃饱我看你贺叔张叔救的我刚吃完饭\n\n### Response:\n0\n\n### End'

In [67]:
import torch
model = LlamaForCausalLM.from_pretrained(
        base_model,
        load_in_8bit=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [68]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear8bitLt(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSN

In [69]:
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# Define LoRA Config
lora_config = LoraConfig(
 r=32,
 lora_alpha=64,
 target_modules=["q_proj","v_proj"],
 lora_dropout=0.05,
 bias="none",
 task_type=TaskType.CAUSAL_LM
)
# prepare int-8 model for training
model = prepare_model_for_int8_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 16,777,216 || all params: 6,755,192,832 || trainable%: 0.24836028248556738


In [70]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear8bitLt(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          

In [72]:
max_length = 2048
seed = 42
test_dataset = preprocess_dataset(tokenizer,max_length,seed,eval_dataset)
test_dataset = test_dataset['train']
print(tokenizer.decode(test_dataset[1]['input_ids']))


预处理数据集...


Map:   0%|          | 0/1846 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction', 'text'],
        num_rows: 1846
    })
})


Map:   0%|          | 0/1846 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1846 [00:00<?, ? examples/s]

<s>以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。

### Instruction:
判断一段话是否为诈骗话术，输出0或1，这段话为-->

Input:
我是中国人民武装部队天水部队，要求订餐，加微信

### Response:
1

### End


In [77]:
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
import transformers
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        num_train_epochs=3,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=10,
        optim="adamw_torch",
        evaluation_strategy="steps",
        save_strategy="steps",
        eval_steps=200,
        save_steps=200,
        output_dir='./output',
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False


trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtranscenderning[0m ([33mtranscender[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669461069007716, max=1.0…



Step,Training Loss,Validation Loss
200,0.9774,1.318475
400,1.096,1.250797
600,0.9387,1.209077
800,1.0649,1.182307
1000,1.0221,1.159278
1200,0.905,1.143131
1400,0.849,1.128625
1600,0.8525,1.115545
1800,0.8784,1.104801
2000,0.7984,1.098012




TrainOutput(global_step=5538, training_loss=0.8737396842609376, metrics={'train_runtime': 125772.1015, 'train_samples_per_second': 0.176, 'train_steps_per_second': 0.044, 'total_flos': 1.7020595013901517e+17, 'train_loss': 0.8737396842609376, 'epoch': 3.0})

In [79]:
input_text = '以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。\n\n### Instruction:\n判断一段话是否为诈骗话术，输出0或1，这段话为-->\n\nInput:\n你记住管他的啥呀话说完能不能行，嗯嗯嗯你你过去买哈，那个啥从那个就是原来不是从那个北门出去不是有一个广场的吗，那地方有一个一家清超市的，外婆头有个手擀面了\n\n### Response:'
model_input = tokenizer(input_text, return_tensors="pt").to("cuda")
model_input

{'input_ids': tensor([[    1, 29871, 30651, 30557, 30392,   233,   146,   146,   235,   194,
           179, 30287, 30502, 31450, 31358, 30210, 31084, 30858, 30214, 31088,
         31795, 31479, 30287, 30502,   236,   131,   133, 30948, 30210, 30742,
           234,   176,   151, 30214, 31366, 30494, 31751, 31450, 31358, 30267,
            13,    13,  2277, 29937,  2799,  4080, 29901,    13, 31791, 31683,
         30287, 31559, 31852, 30392, 31191, 30573,   235,   178,   139,   236,
           173,   154, 31852,   233,   159,   178, 30214, 31573, 30544, 29900,
         31391, 29896, 30214, 30810, 31559, 31852, 30573, 15110,    13,    13,
          4290, 29901,    13, 30919, 31410,   231,   192,   146, 31624, 31221,
         30210,   232,   152,   168,   232,   148,   131, 31852, 31639, 31366,
         30815, 30413, 30815, 30448, 30214,   232,   154,   178,   232,   154,
           178,   232,   154,   178, 30919, 30919, 31138, 31475,   231,   188,
           179,   232,   150,   139, 3

In [84]:
output = model.generate(**model_input, max_new_tokens=100)[0]
output
output = tokenizer.decode(output,skip_special_tokens=True)
output

'以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。\n\n### Instruction:\n判断一段话是否为诈骗话术，输出0或1，这段话为-->\n\nInput:\n你记住管他的啥呀话说完能不能行，嗯嗯嗯你你过去买哈，那个啥从那个就是原来不是从那个北门出去不是有一个广场的吗，那地方有一个一家清超市的，外婆头有个手擀面了\n\n### Response:\n0\n\n### End_of_task\n\n### End_of_instruction\n\n### End_of_input\n\n### End_of_output\n\n### End_of_task\n\n### End_of_instruction\n\n### End_of_input\n\n### End_of_output\n\n### End_of_task\n\n### End_of_instruction\n\n### End'

In [85]:
output[output.find('Response')+10]

'0'

In [114]:
from torch.utils.data import Dataset
import json
class ZpData(Dataset):
    def __init__(self, data_file):
        self.data = self.load_data(data_file)
    
    def load_data(self, data_file):
        with open(data_file, mode='r') as f:
            data = f.read()
        samples = json.loads(data)
        
        Data = {}
        for idx, sample in enumerate(samples):
            formatted_prompt = self.create_prompt_formats(sample)
            Data[idx] = {'text': formatted_prompt, 'label': sample['output']}
        
        return Data

    def create_prompt_formats(self, sample):
        INTRO_BLURB = "以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。"
        INSTRUCTION_KEY = "### Instruction:"
        INPUT_KEY = "Input:"
        RESPONSE_KEY = "### Response:"
        
        blurb = f"{INTRO_BLURB}"
        instruction = f"{INSTRUCTION_KEY}\n{sample['instruction']}"
        input_context = f"{INPUT_KEY}\n{sample['input']}" if sample["input"] else None
        response = f"{RESPONSE_KEY}"
        parts = [part for part in [blurb, instruction, input_context, response] if part]
        formatted_prompt = "\n\n".join(parts)
        return formatted_prompt

    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [121]:
test_data = ZpData('/home/public/ldn/zpLLM/data/test_data.json')
print(test_data[0],test_data[1],test_data[2])


{'text': '以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。\n\n### Instruction:\n判断一段话是否为诈骗话术，输出0或1，这段话为-->\n\nInput:\n你记住管他的啥呀话说完能不能行，嗯嗯嗯你你过去买哈，那个啥从那个就是原来不是从那个北门出去不是有一个广场的吗，那地方有一个一家清超市的，外婆头有个手擀面了\n\n### Response:', 'label': '0'} {'text': '以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。\n\n### Instruction:\n判断一段话是否为诈骗话术，输出0或1，这段话为-->\n\nInput:\n平安银行贷款部的工作人员，有一至五万的额度可以申请，详细情况稍后会有工作人员添加你的微信给你解答\n\n### Response:', 'label': '1'} {'text': '以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。\n\n### Instruction:\n判断一段话是否为诈骗话术，输出0或1，这段话为-->\n\nInput:\n抖音，我是你的小可爱，这样好为了支持中小微企业的发展京东金融特地为此类中小微企业主提供最高五十万的贷款服务您要是正好有这方面需求的话我给您介绍一下好吗，这个贷款产品呢最高有五十万的额度现在推广期内我们会给本次电话通知到的业主免费用十次天的福利这个真的是机会难得以后\n\n### Response:', 'label': '1'}


In [122]:
from torch.utils.data import DataLoader
# batch
def collote_fn(batch_samples):
    batch_sentence_1 = []
    batch_label = []
    for sample in batch_samples:
        batch_sentence_1.append(sample['text'])
        batch_label.append(int(sample['label']))
    X = tokenizer(
        batch_sentence_1, 
        padding=True, 
        truncation=True, 
        return_tensors="pt"
    )
    y = torch.tensor(batch_label)
    return X, y

In [148]:
test_dataloader = DataLoader(test_data, batch_size=128, shuffle=True, collate_fn=collote_fn)
size = len(test_dataloader.dataset)
size

1846

In [149]:
correct = 0

In [150]:
model.eval()
device = "cuda:1"
import time
with torch.no_grad():
    i = 0
    for X, y in test_dataloader:
        print(f'\n-------------------------------------batch{i+1}----------------------------------\n')
        X, y = X.to(device), y.to(device)

        start_time = time.time()
        print(f'-------------------------------------shape----------------------------------\n')
        print('\nbatch_X shape:', {k: v.shape for k, v in X.items()})
        print("\n")
        print('batch_y shape:', y.shape)
        print("\n")

        output = model.generate(**X, max_new_tokens=100)
        output = tokenizer.batch_decode(output, skip_special_tokens=True)
        end_time = time.time()
        execution_time = end_time - start_time
        print(f"\n推理时间为：{execution_time} 秒\n")

        result = []
        for str in output:
            result.append(int(str[str.find('Response')+10]))
        print("\n----------------------------------final result---------------------------------\n")
        print(f"{result}\n")
        preds = torch.tensor(result, device=device)
        print("\n----------------------------------predictions---------------------------------\n")
        print(f"{preds}\n")
        print("\n----------------------------------labels---------------------------------\n")
        print(f"{y}\n")
        print("\n----------------------------------correct---------------------------------\n")
        print(f"{(preds == y).sum().item()}\n")
        correct += (preds == y).sum().item()
        print("\n----------------------------------total correct---------------------------------\n")
        print(correct)
        i += 1
        
correct /= size
print(f"{mode} Accuracy: {(100*correct):>0.1f}%\n")


-------------------------------------batch1----------------------------------

-------------------------------------shape----------------------------------


batch_X shape: {'input_ids': torch.Size([128, 417]), 'attention_mask': torch.Size([128, 417])}


batch_y shape: torch.Size([128])



推理时间为：8502.98522734642 秒


----------------------------------final result---------------------------------

[1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0]


----------------------------------predictions---------------------------------

tensor([1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
        0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 

KeyboardInterrupt: 