In [1]:
import json
import torch
import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

In [2]:
model_path = "/data/disk1/guohaoran/model/Qwen2.5-3B-Instruct"
device = 'cuda:7'

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float64,
    device_map=device,
)
print(model)
print(tokenizer)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 2048)
    (layers): ModuleList(
      (0-35): 36 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=256, bias=True)
          (v_proj): Linear(in_features=2048, out_features=256, bias=True)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=2048, out_features=11008, bias=False)
          (up_proj): Linear(in_features=2048, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((2048,), eps=1e-06)
      )
    )
    (norm):

In [3]:
tag_map = {
    'human': 'user',
    'gpt': 'assistant',
    'system': 'system',
    'function_call': 'function',
    'observation': 'observation',
}

def process(row):
    data = json.loads(row['text'])
    messages = [{'role': tag_map[line['from']], 'content': line['value']} for line in data['conversations']]
    text = tokenizer.apply_chat_template(messages, tokenize=False)
    result = tokenizer([text], return_tensors="np", padding='max_length', max_length=1024, truncation=True)
    result['attention_mask'] = result['attention_mask'][0]
    result.update({'id': data['id'], 'text': text, 'length': len(text)})
    return result

dataset = load_dataset(
    path='text',
    data_files=['/data/disk1/guohaoran/data/sharegpt_zh_38K.jsonl'],
    cache_dir='/data/disk1/guohaoran/data/.cache',
    split='train',
)
dataset = dataset.map(process)
dataset = dataset.train_test_split(test_size=0.1)

train_dataset, test_dataset = dataset['train'], dataset['test']
train_dataset = train_dataset.shuffle()

print(dataset)
item = train_dataset[0]
print(item)


Map:   0%|          | 0/38557 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'id', 'length'],
        num_rows: 34701
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'id', 'length'],
        num_rows: 3856
    })
})
{'text': '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n为Itonn沙滩排球队创作一首队歌。请记住，Itonn是innovation一词的字母组合。<|im_end|>\n<|im_start|>assistant\n创新是我们的灵感源泉\n凭借技巧，我们成为一个伟大的国家\n每个点都转化为情感\n我们对排球的热爱是我们的动力\nItonn，我们的团队强大而勇敢\n无论是沙滩还是球场，我们从不被打败\n凭借团结和斗志，我们能克服任何障碍\n我们是一个永不放弃的团队，永远不会失利\n在沙滩排球方面，我们是标杆\n凭借才华和策略，我们达到卓越\n我们是一个追求完美的团队\n每一次胜利，我们都会重新点燃我们的热情\nItonn是我们的品牌，我们的身份\n沙滩排球是我们的最大幸福\n心中充满激动\nItonn，我们的团队，是我们的热情！<|im_end|>\n<|im_start|>user\n现在创造三个加油呐喊。请记住球员的姓名是Jorge和Luccas。<|im_end|>\n<|im_start|>assistant\n1. "Jorge和Luccas，一起进攻！Itonn，我们的团队是无敌的！"\n2. "加油，Itonn！有了Jorge和Luccas的支持，没有人能阻止我们获胜的决心！"\n3. "是时候展示我们是谁的领导了，Itonn！有了Jorge和Luccas在球场上，我们是一支冠军团队！"<|im_end|>\n<|im_star

In [9]:
training_args = TrainingArguments(
    output_dir='/data/disk1/guohaoran/RhineAC/workspace/output',
    eval_strategy="epoch",
    per_device_train_batch_size=2,
    per_gpu_train_batch_size=2,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


forward torch.Size([2, 1, 1024, 2048])


RuntimeError: shape '[2, 1, 16, 128]' is invalid for input of size 4194304