In [10]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTConfig, SFTTrainer
import bitsandbytes as bnb
import os

In [11]:
base_model_id = "Qwen2-0.5B-bnb-4bit"
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(base_model_id)
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
ds = load_dataset("csv", data_files="data/songci.csv")


In [13]:
ds = ds.remove_columns(["id", "group_index", "author"])
ds = ds.filter(lambda row: row["dynasty"] == "宋" and row["type"] == "詞")

In [14]:
def format_chat_template(row):
  messages = [
      {
          "role": "system",
          "content": "你是一个宋词专家，根据用户的词牌名和其他要求作词",
      },
      {"role": "user", "content": row["title"]},
      {"role": "assistant", "content": row["content"]},
  ]

  row["text"] = tokenizer.apply_chat_template(messages, tokenize=False)
  return row

ds = ds.map(format_chat_template)
ds['train'][:3]

{'title': ['蝶恋花', '桂殿秋', '好事近'],
 'type': ['詞', '詞', '詞'],
 'dynasty': ['宋', '宋', '宋'],
 'content': ['团扇题诗春又晚。小梦惊残，碧草池塘满。一曲银钩帘半捲。绿窗睡足莺声软。瘦损衣围罗带减。前度风流，陡觉心情懒。谁品新腔拈翠管。画楼吹彻江南怨。',
  '青帝子，碧莲宫。不驾云车骑白龙。瑶池路远羽衣湿，玉珮泠泠明月中。',
  '花底一声莺，花上半钩斜月。月落乌啼何处，点飞英如雪。东风吹尽去年愁，解放丁香结。惊动小亭红雨，舞双双金蝶。'],
 'text': ['<|im_start|>system\n你是一个宋词专家，根据用户的词牌名和其他要求作词<|im_end|>\n<|im_start|>user\n蝶恋花<|im_end|>\n<|im_start|>assistant\n团扇题诗春又晚。小梦惊残，碧草池塘满。一曲银钩帘半捲。绿窗睡足莺声软。瘦损衣围罗带减。前度风流，陡觉心情懒。谁品新腔拈翠管。画楼吹彻江南怨。<|im_end|>\n',
  '<|im_start|>system\n你是一个宋词专家，根据用户的词牌名和其他要求作词<|im_end|>\n<|im_start|>user\n桂殿秋<|im_end|>\n<|im_start|>assistant\n青帝子，碧莲宫。不驾云车骑白龙。瑶池路远羽衣湿，玉珮泠泠明月中。<|im_end|>\n',
  '<|im_start|>system\n你是一个宋词专家，根据用户的词牌名和其他要求作词<|im_end|>\n<|im_start|>user\n好事近<|im_end|>\n<|im_start|>assistant\n花底一声莺，花上半钩斜月。月落乌啼何处，点飞英如雪。东风吹尽去年愁，解放丁香结。惊动小亭红雨，舞双双金蝶。<|im_end|>\n']}

In [15]:
ds = ds.shuffle(seed=42)

In [16]:
dataset = ds['train'].select(range(2000)).train_test_split(test_size=0.1)

In [17]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['title', 'type', 'dynasty', 'content', 'text'],
        num_rows: 1800
    })
    test: Dataset({
        features: ['title', 'type', 'dynasty', 'content', 'text'],
        num_rows: 200
    })
})


In [18]:
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    bias="none",
)

model = get_peft_model(model, config)

In [19]:
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 4399104 | total: 498431872 | Percentage: 0.8826%


In [20]:
sft_config = SFTConfig(
    output_dir="/model",
    dataset_batch_size=1,
    dataset_text_field="text",
    warmup_steps=10,
    # max_steps=100,
    bf16=True,
    num_train_epochs=1,
    learning_rate=2e-4,
    logging_steps=10,
    # packing=True,
)

In [21]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=config,
    max_seq_length=256,
    args=sft_config,
    dataset_batch_size=1,
)



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [22]:
trainer.train()

  0%|          | 0/225 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 4.7995, 'learning_rate': 0.0002, 'epoch': 0.04}
{'loss': 3.618, 'learning_rate': 0.00019069767441860466, 'epoch': 0.09}
{'loss': 3.5571, 'learning_rate': 0.0001813953488372093, 'epoch': 0.13}
{'loss': 3.6179, 'learning_rate': 0.00017209302325581395, 'epoch': 0.18}
{'loss': 3.5592, 'learning_rate': 0.00016279069767441862, 'epoch': 0.22}
{'loss': 3.5199, 'learning_rate': 0.00015348837209302327, 'epoch': 0.27}
{'loss': 3.3457, 'learning_rate': 0.00014418604651162791, 'epoch': 0.31}
{'loss': 3.5625, 'learning_rate': 0.00013488372093023256, 'epoch': 0.36}
{'loss': 3.4644, 'learning_rate': 0.0001255813953488372, 'epoch': 0.4}
{'loss': 3.449, 'learning_rate': 0.00011627906976744187, 'epoch': 0.44}
{'loss': 3.3784, 'learning_rate': 0.00010697674418604651, 'epoch': 0.49}
{'loss': 3.359, 'learning_rate': 9.767441860465116e-05, 'epoch': 0.53}
{'loss': 3.5637, 'learning_rate': 8.837209302325582e-05, 'epoch': 0.58}
{'loss': 3.4526, 'learning_rate': 7.906976744186047e-05, 'epoch': 0.62}
{'l

TrainOutput(global_step=225, training_loss=3.5255275302463107, metrics={'train_runtime': 714.6091, 'train_samples_per_second': 2.519, 'train_steps_per_second': 0.315, 'train_loss': 3.5255275302463107, 'epoch': 1.0})

In [23]:
trainer.evaluate()

  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 3.4121129512786865,
 'eval_runtime': 15.8034,
 'eval_samples_per_second': 12.655,
 'eval_steps_per_second': 1.582,
 'epoch': 1.0}

In [29]:
messages = [
    {
        "role": "system",
        "content": "你是一个宋词专家，根据用户的词牌名和其他要求作词"
    },
    {
        "role": "user",
        "content": "水龙吟"
    }
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, 
                                       add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt', padding=False, 
                   truncation=True).to("cuda")
# model = AutoModelForCausalLM.from_pretrained("Qwen2-0.5B-Instruct").to(device)

outputs = model.generate(**inputs, max_new_tokens=64, num_beams=5, do_sample=True, early_stopping=True,
                         num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



风露初晴，风月初晴，初晴初雨。风月初晴，初晴初雨。风月初晴，初晴初雨。风月初晴，初晴初雨。风月初晴，初晴初雨。风月初晴，初晴初雨


In [27]:
print(text)

system
你是一个宋词专家，根据用户的词牌名和其他要求作词
user
水龙吟
assistant
月色如水，花阴似雪，帘幕轻寒。梦里梦中，一枕无眠。倚阑干，倚阑干。人归无寐，人归无寐。帘幕轻寒。帘幕轻寒。帘幕轻寒。帘幕轻寒。帘
