In [1]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTConfig, SFTTrainer
import bitsandbytes as bnb
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_model_id = "Qwen2-0.5B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(base_model_id,
                                             load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [3]:
ds = load_dataset("csv", data_files="data/songci.csv")


In [4]:
ds = ds.remove_columns(["id", "group_index", "author"])
ds = ds.filter(lambda row: row["dynasty"] == "宋" and row["type"] == "詞")

In [5]:
def format_chat_template(row):
  messages = [
      {
          "role": "system",
          "content": "你是一个宋词专家，根据用户的词牌名和其他要求作词",
      },
      {"role": "user", "content": row["title"]},
      {"role": "assistant", "content": row["content"]},
  ]

  row["text"] = tokenizer.apply_chat_template(messages, tokenize=False)
  return row

ds = ds.map(format_chat_template)
ds['train'][:3]

{'title': ['蝶恋花', '桂殿秋', '好事近'],
 'type': ['詞', '詞', '詞'],
 'dynasty': ['宋', '宋', '宋'],
 'content': ['团扇题诗春又晚。小梦惊残，碧草池塘满。一曲银钩帘半捲。绿窗睡足莺声软。瘦损衣围罗带减。前度风流，陡觉心情懒。谁品新腔拈翠管。画楼吹彻江南怨。',
  '青帝子，碧莲宫。不驾云车骑白龙。瑶池路远羽衣湿，玉珮泠泠明月中。',
  '花底一声莺，花上半钩斜月。月落乌啼何处，点飞英如雪。东风吹尽去年愁，解放丁香结。惊动小亭红雨，舞双双金蝶。'],
 'text': ['<|im_start|>system\n你是一个宋词专家，根据用户的词牌名和其他要求作词<|im_end|>\n<|im_start|>user\n蝶恋花<|im_end|>\n<|im_start|>assistant\n团扇题诗春又晚。小梦惊残，碧草池塘满。一曲银钩帘半捲。绿窗睡足莺声软。瘦损衣围罗带减。前度风流，陡觉心情懒。谁品新腔拈翠管。画楼吹彻江南怨。<|im_end|>\n',
  '<|im_start|>system\n你是一个宋词专家，根据用户的词牌名和其他要求作词<|im_end|>\n<|im_start|>user\n桂殿秋<|im_end|>\n<|im_start|>assistant\n青帝子，碧莲宫。不驾云车骑白龙。瑶池路远羽衣湿，玉珮泠泠明月中。<|im_end|>\n',
  '<|im_start|>system\n你是一个宋词专家，根据用户的词牌名和其他要求作词<|im_end|>\n<|im_start|>user\n好事近<|im_end|>\n<|im_start|>assistant\n花底一声莺，花上半钩斜月。月落乌啼何处，点飞英如雪。东风吹尽去年愁，解放丁香结。惊动小亭红雨，舞双双金蝶。<|im_end|>\n']}

In [6]:
ds = ds.shuffle(seed=42)

In [7]:
dataset = ds['train'].select(range(2000)).train_test_split(test_size=0.1)

In [8]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['title', 'type', 'dynasty', 'content', 'text'],
        num_rows: 1800
    })
    test: Dataset({
        features: ['title', 'type', 'dynasty', 'content', 'text'],
        num_rows: 200
    })
})


In [9]:
config = LoraConfig(
    r=2,
    lora_alpha=4,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    bias="none",
)

model = get_peft_model(model, config)

In [10]:
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 1099776 | total: 495132544 | Percentage: 0.2221%


In [11]:
sft_config = SFTConfig(
    output_dir="/model",
    dataset_batch_size=2,
    dataset_text_field="text",
    warmup_steps=10,
    max_seq_length=256,
    # max_steps=100,
    bf16=True,
    num_train_epochs=1,
    learning_rate=2e-4,
    logging_steps=10,
    packing=True,
)

In [12]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=config,
    args=sft_config,
)


Generating train split: 776 examples [00:00, 2208.72 examples/s]
Generating train split: 89 examples [00:00, 2628.64 examples/s]


In [None]:
trainer.train()

In [14]:
trainer.evaluate()

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
100%|██████████| 12/12 [00:10<00:00,  1.15it/s]


{'eval_loss': 3.6573894023895264,
 'eval_runtime': 10.6001,
 'eval_samples_per_second': 8.396,
 'eval_steps_per_second': 1.132,
 'epoch': 1.0}

In [21]:
messages = [
    {
        "role": "system",
        "content": "你是一个宋词专家，根据用户的词牌名和其他要求作词"
    },
    {
        "role": "user",
        "content": "水龙吟"
    }
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, 
                                       add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt', padding=False, 
                   truncation=True).to("cuda")
# model = AutoModelForCausalLM.from_pretrained("Qwen2-0.5B-Instruct").to(device)

outputs = model.generate(**inputs, max_new_tokens=1024, num_beams=5, do_sample=True, early_stopping=True,
                         num_return_sequences=1, repetition_penalty=2.0)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])


风定云平，月明如水，夜深人静。玉箫声细，金鼓声长，谁与我共听此曲。酒醒无梦，花醉无愁，只待春归去。试问闲愁何事，不似旧时欢会。但愿君心似我情，莫相猜疑。


In [22]:
print(text)

system
你是一个宋词专家，根据用户的词牌名和其他要求作词
user
水龙吟
assistant
风定云平，月明如水，夜深人静。玉箫声细，金鼓声长，谁与我共听此曲。酒醒无梦，花醉无愁，只待春归去。试问闲愁何事，不似旧时欢会。但愿君心似我情，莫相猜疑。
