In [1]:
from datasets import load_dataset
import os
import torch
os.environ["CUDA_VISIBLE_DEVICES"] = '1'
print(torch.cuda.current_device)


train_data_path = '/home/public/ldn/zpLLM/data/train_data.json'
eval_data_path = '/home/public/ldn/zpLLM/data/test_data.json'
# Load dataset from the hub
train_dataset = load_dataset("json", data_files=train_data_path)
eval_dataset = load_dataset("json", data_files=eval_data_path)

<function current_device at 0x7f7b14268160>


In [2]:
train_dataset,eval_dataset

(DatasetDict({
     train: Dataset({
         features: ['output', 'input', 'instruction'],
         num_rows: 7384
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['output', 'input', 'instruction'],
         num_rows: 1846
     })
 }))

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer
base_model = '/home/ldn/.cache/huggingface/hub/models--baichuan-inc--baichuan-7B/snapshots/c1a5c7d5b7f50ecc51bb0e08150a9f12e5656756'
tokenizer = AutoTokenizer.from_pretrained(base_model,trust_remote_code=True)
tokenizer
# tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan-7B", trust_remote_code=True)

BaiChuanTokenizer(name_or_path='/home/ldn/.cache/huggingface/hub/models--baichuan-inc--baichuan-7B/snapshots/c1a5c7d5b7f50ecc51bb0e08150a9f12e5656756', vocab_size=64000, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True)}, clean_up_tokenization_spaces=False)

In [8]:
tokenizer.pad_token_id = (
        0  # unk. we want this to be different from the eos token
    )
tokenizer.padding_side = "left"  # Allow batched inference
tokenizer

BaiChuanTokenizer(name_or_path='/home/ldn/.cache/huggingface/hub/models--baichuan-inc--baichuan-7B/snapshots/c1a5c7d5b7f50ecc51bb0e08150a9f12e5656756', vocab_size=64000, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False)

In [9]:
train_dataset

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 7384
    })
})

In [11]:
from functools import partial
# 格式化数据集格式
def create_prompt_formats(sample):
    INTRO_BLURB = "以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。"
    INSTRUCTION_KEY = "### Instruction:"
    INPUT_KEY = "Input:"
    RESPONSE_KEY = "### Response:"
    END_KEY = "### End"
    
    blurb = f"{INTRO_BLURB}"
    instruction = f"{INSTRUCTION_KEY}\n{sample['instruction']}"
    input_context = f"{INPUT_KEY}\n{sample['input']}" if sample["input"] else None
    response = f"{RESPONSE_KEY}\n{sample['output']}"
    end = f"{END_KEY}"
    
    parts = [part for part in [blurb, instruction, input_context, response, end] if part]
    # print(parts)
    formatted_prompt = "\n\n".join(parts)
    # print(formatted_prompt)
    sample["text"] = formatted_prompt
    return sample

# tokenize a batch
def preprocess_batch(batch,tokenizer,max_length):
    return tokenizer(
        batch['text'],
        max_length=max_length,
        truncation=True
    )

# format and tokenize 
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset: str):
    print('预处理数据集...')
    # 每个样本添加提示
    dataset = dataset.map(create_prompt_formats)
    print(dataset)
    
    # 去掉每个样本的无用的列
    _preprocessing_function = partial(preprocess_batch,max_length=max_length,tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched = True,
        remove_columns = ["instruction", "input", "output", "text"],
    )
    
    # 过滤掉 input_ids 超出 max_length的样本
    dataset = dataset.filter(lambda sample: len(sample['input_ids']) < max_length)
    
    # shuffle dataset
    dataset = dataset.shuffle(seed=seed)
    return dataset

In [12]:
max_length = 2048
seed = 42
train_dataset = preprocess_dataset(tokenizer,max_length,seed,train_dataset)

预处理数据集...
DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction', 'text'],
        num_rows: 7384
    })
})


Map:   0%|          | 0/7384 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7384 [00:00<?, ? examples/s]

In [13]:
train_dataset = train_dataset['train']

In [14]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 7384
})

In [16]:
print(tokenizer.decode(train_dataset[0]['input_ids']))
print(tokenizer.decode(train_dataset[1]['input_ids']))
print(tokenizer.decode(train_dataset[2]['input_ids']))

以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。

### Instruction:
判断一段话是否为诈骗话术，输出0或1，这段话为-->

Input:
哎你好老板我是前面那个骑手，去那个狂浪加油站，哎呀妈呀他是这样的他这个，呃时间有点不够你能给我加两分钟时间不呀，我说时间有点不够你能给我交两分钟时间不，噢你看你就是在那个上面b柱那个几楼几楼的上面你写写写写几个字，加上几个字就叫

### Response:
0

### End
以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。

### Instruction:
判断一段话是否为诈骗话术，输出0或1，这段话为-->

Input:
啊喂吃了没，ah，咋没有说还没吃啊，找啥原因呢，那哈点个外卖吗没吃啊，完了那做了吗给你啊，没时间点名字快点点赶紧点点个面，我刚吃饱我看你贺叔张叔救的我刚吃完饭

### Response:
0

### End
以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。

### Instruction:
判断一段话是否为诈骗话术，输出0或1，这段话为-->

Input:
你需要不需要需要需要，这里还还还有人啊，你看他比较细菌来我就是旁边有个人我不方便说我我说我，老师在吸血真厉害，给大家细节来，你的比较急什么

### Response:
0

### End


In [17]:
import torch
model = AutoModelForCausalLM.from_pretrained(base_model,
                                            device_map="auto",         
                                            load_in_8bit=True,
                                            torch_dtype=torch.float16,
                                            trust_remote_code=True)



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/ldn/anaconda3/envs/finetune/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda113.so
CUDA SETUP: CUDA runtime path found: /home/ldn/anaconda3/envs/finetune/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /home/ldn/anaconda3/envs/finetune/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...


Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


[2023-08-30 19:29:45,348] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [18]:
model

BaiChuanForCausalLM(
  (model): Model(
    (embed_tokens): Embedding(64000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x DecoderLayer(
        (self_attn): Attention(
          (W_pack): Linear8bitLt(in_features=4096, out_features=12288, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): RotaryEmbedding()
        )
        (mlp): MLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear8bitLt(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): RMSNorm()
        (post_attention_layernorm): RMSNorm()
      )
    )
    (norm): RMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=64000, bias=False)
)

In [20]:
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# Define LoRA Config
lora_config = LoraConfig(
 r=32,
 lora_alpha=64,
 target_modules=["W_pack"],
 lora_dropout=0.05,
 bias="none",
 task_type=TaskType.CAUSAL_LM
)
# prepare int-8 model for training
model = prepare_model_for_int8_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 16,777,216 || all params: 7,017,336,832 || trainable%: 0.23908238127452622


In [21]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BaiChuanForCausalLM(
      (model): Model(
        (embed_tokens): Embedding(64000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x DecoderLayer(
            (self_attn): Attention(
              (W_pack): Linear8bitLt(
                in_features=4096, out_features=12288, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=12288, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
              (rotar

In [23]:
max_length = 2048
seed = 42
test_dataset = preprocess_dataset(tokenizer,max_length,seed,eval_dataset)
test_dataset = test_dataset['train']
print(tokenizer.decode(test_dataset[1]['input_ids']))
print(tokenizer.decode(test_dataset[2]['input_ids']))
print(tokenizer.decode(test_dataset[3]['input_ids']))

预处理数据集...
DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction', 'text'],
        num_rows: 1846
    })
})
以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。

### Instruction:
判断一段话是否为诈骗话术，输出0或1，这段话为-->

Input:
我是中国人民武装部队天水部队，要求订餐，加微信

### Response:
1

### End
以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。

### Instruction:
判断一段话是否为诈骗话术，输出0或1，这段话为-->

Input:
我是淘宝客服，为了回馈来客户要给你送一个杯子。你杯子收到货没有，你给我一个好评，并且会给你好处费15块，你添加一个企业微信

### Response:
1

### End
以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。

### Instruction:
判断一段话是否为诈骗话术，输出0或1，这段话为-->

Input:
哦，day，they，我我听着喂，我最好，我才说火山的

### Response:
0

### End


In [24]:
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
import transformers
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        warmup_steps=100,
        num_train_epochs=1,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=10,
        optim="adamw_torch",
        evaluation_strategy="steps",
        save_strategy="steps",
        eval_steps=200,
        save_steps=400,
        output_dir='/home/public/ldn/models/zp-baichuan7b-lora',
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False

trainer.train()
new_model = "/home/public/ldn/models/zp-baichuan7b-lora/zp-lora"
trainer.model.save_pretrained(new_model)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtranscenderning[0m ([33mtranscender[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668649450002702, max=1.0…



Step,Training Loss,Validation Loss
200,1.4646,1.69793
400,1.732,1.61705
600,1.4365,1.573681
800,1.6686,1.546007
1000,1.5915,1.520054
1200,1.459,1.502976
1400,1.3394,1.489926
1600,1.397,1.47773
1800,1.4093,1.471586




In [25]:
input_text = '以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。\n\n### Instruction:\n判断一段话是否为诈骗话术，输出0或1，这段话为-->\n\nInput:\n你记住管他的啥呀话说完能不能行，嗯嗯嗯你你过去买哈，那个啥从那个就是原来不是从那个北门出去不是有一个广场的吗，那地方有一个一家清超市的，外婆头有个手擀面了\n\n### Response:'
model_input = tokenizer(input_text, return_tensors="pt").to("cuda")
model_input

{'input_ids': tensor([[31106,  4567, 31161, 14277,  1197,  5934, 31135, 17344,    72, 31488,
         31628, 31827,  1197, 11886, 31135,  8950,    72,  3750, 31577,  5934,
            73,     5,     5, 26411,  2081,  2768, 31143,     5, 10345, 14055,
         31455,  4531, 31178, 10910, 31455, 31538,    72, 17515,    52, 31399,
            53,    72, 20480, 31455, 31178,    90,     5,     5, 16401, 31143,
             5, 31203, 15380, 31378,  4579, 32617, 32889, 17060, 31515, 17740,
         31205,    72, 34065, 34065, 34065, 31203, 31203,  6249, 31682, 32021,
            72,  6646, 32617, 31382,  6646,  1522, 10320,  1925, 31382,  6646,
         31505, 31435,  9928,  1925,  9344,  9402, 31135, 31763,    72, 31379,
          4054,  9344, 10405, 31523, 12127, 31135,    72, 31357, 32613, 31473,
         11330, 31350, 35331, 31247, 31172,     5,     5, 26411, 16275, 31143]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [27]:
output = model.generate(**model_input, max_new_tokens=10)[0]
output
output = tokenizer.decode(output,skip_special_tokens=True)
output

'以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。\n\n### Instruction:\n判断一段话是否为诈骗话术，输出0或1，这段话为-->\n\nInput:\n你记住管他的啥呀话说完能不能行，嗯嗯嗯你你过去买哈，那个啥从那个就是原来不是从那个北门出去不是有一个广场的吗，那地方有一个一家清超市的，外婆头有个手擀面了\n\n### Response:\n0\n\n### End:\n嗯嗯'

In [28]:
output[output.find('Response')+10]

'0'

In [29]:
from torch.utils.data import Dataset
import json
class ZpData(Dataset):
    def __init__(self, data_file):
        self.data = self.load_data(data_file)
    
    def load_data(self, data_file):
        with open(data_file, mode='r') as f:
            data = f.read()
        samples = json.loads(data)
        
        Data = {}
        for idx, sample in enumerate(samples):
            formatted_prompt = self.create_prompt_formats(sample)
            Data[idx] = {'text': formatted_prompt, 'label': sample['output']}
        
        return Data

    def create_prompt_formats(self, sample):
        INTRO_BLURB = "以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。"
        INSTRUCTION_KEY = "### Instruction:"
        INPUT_KEY = "Input:"
        RESPONSE_KEY = "### Response:"
        
        blurb = f"{INTRO_BLURB}"
        instruction = f"{INSTRUCTION_KEY}\n{sample['instruction']}"
        input_context = f"{INPUT_KEY}\n{sample['input']}" if sample["input"] else None
        response = f"{RESPONSE_KEY}"
        parts = [part for part in [blurb, instruction, input_context, response] if part]
        formatted_prompt = "\n\n".join(parts)
        return formatted_prompt

    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [30]:
test_data = ZpData('/home/public/ldn/zpLLM/data/test_data.json')
print(test_data[0],test_data[1],test_data[2])


{'text': '以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。\n\n### Instruction:\n判断一段话是否为诈骗话术，输出0或1，这段话为-->\n\nInput:\n你记住管他的啥呀话说完能不能行，嗯嗯嗯你你过去买哈，那个啥从那个就是原来不是从那个北门出去不是有一个广场的吗，那地方有一个一家清超市的，外婆头有个手擀面了\n\n### Response:', 'label': '0'} {'text': '以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。\n\n### Instruction:\n判断一段话是否为诈骗话术，输出0或1，这段话为-->\n\nInput:\n平安银行贷款部的工作人员，有一至五万的额度可以申请，详细情况稍后会有工作人员添加你的微信给你解答\n\n### Response:', 'label': '1'} {'text': '以下是描述一个任务的指示，请编写一个适当的回答，完成该任务。\n\n### Instruction:\n判断一段话是否为诈骗话术，输出0或1，这段话为-->\n\nInput:\n抖音，我是你的小可爱，这样好为了支持中小微企业的发展京东金融特地为此类中小微企业主提供最高五十万的贷款服务您要是正好有这方面需求的话我给您介绍一下好吗，这个贷款产品呢最高有五十万的额度现在推广期内我们会给本次电话通知到的业主免费用十次天的福利这个真的是机会难得以后\n\n### Response:', 'label': '1'}


In [31]:
from torch.utils.data import DataLoader
# batch
def collote_fn(batch_samples):
    batch_sentence_1 = []
    batch_label = []
    for sample in batch_samples:
        batch_sentence_1.append(sample['text'])
        batch_label.append(int(sample['label']))
    X = tokenizer(
        batch_sentence_1, 
        padding=True, 
        truncation=True, 
        return_tensors="pt"
    )
    y = torch.tensor(batch_label)
    return X, y

In [32]:
test_dataloader = DataLoader(test_data, batch_size=1, shuffle=True, collate_fn=collote_fn)
size = len(test_dataloader.dataset)
size

1846

In [33]:
correct = 0

In [35]:
model.eval()
device = "cuda"
import time
with torch.no_grad():
    i = 0
    for X, y in test_dataloader:
        print(f'\n-------------------------------------batch{i+1}----------------------------------\n')
        X, y = X.to(device), y.to(device)

        start_time = time.time()
        print(f'-------------------------------------shape----------------------------------\n')
        print('\nbatch_X shape:', {k: v.shape for k, v in X.items()})
        print("\n")
        print('batch_y shape:', y.shape)
        print("\n")

        output = model.generate(**X, max_new_tokens=10)
        output = tokenizer.batch_decode(output, skip_special_tokens=True)
        end_time = time.time()
        execution_time = end_time - start_time
        print(f"\n推理时间为：{execution_time} 秒\n")

        result = []
        for str in output:
            result.append(int(str[str.find('Response')+10]))
        print("\n----------------------------------final result---------------------------------\n")
        print(f"{result}\n")
        preds = torch.tensor(result, device=device)
        print("\n----------------------------------predictions---------------------------------\n")
        print(f"{preds}\n")
        print("\n----------------------------------labels---------------------------------\n")
        print(f"{y}\n")
        print("\n----------------------------------correct---------------------------------\n")
        print(f"{(preds == y).sum().item()}\n")
        correct += (preds == y).sum().item()
        print("\n----------------------------------total correct---------------------------------\n")
        print(correct)
        i += 1
        import sys
        sys.exit(0)
        
correct /= size
print(f"{mode} Accuracy: {(100*correct):>0.1f}%\n")


-------------------------------------batch1----------------------------------

-------------------------------------shape----------------------------------


batch_X shape: {'input_ids': torch.Size([1, 69]), 'attention_mask': torch.Size([1, 69])}


batch_y shape: torch.Size([1])



推理时间为：11.778877019882202 秒


----------------------------------final result---------------------------------

[1]


----------------------------------predictions---------------------------------

tensor([1], device='cuda:0')


----------------------------------labels---------------------------------

tensor([1], device='cuda:0')


----------------------------------correct---------------------------------

1


----------------------------------total correct---------------------------------

1


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
