In [1]:
import os
os.environ["TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL"] = "1"

In [2]:
from datasets import load_dataset

datasets = load_dataset("0xDing/wikipedia-cn-20230720-filtered", split="train")

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Langboat/bloom-389m-zh")

In [4]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("Langboat/bloom-389m-zh").to("cuda")

Loading weights:   0%|          | 0/294 [00:00<?, ?it/s]



In [5]:
def process_function(examples, tokenizer=tokenizer):
    contents = [v + tokenizer.eos_token for v in examples["completion"]]
    return tokenizer(contents, max_length=384, truncation=True)

In [6]:
tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets.column_names)

In [7]:
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.2)

In [8]:
from transformers import TrainingArguments

train_args = TrainingArguments(
    output_dir="./causal_lm",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=1,
    fp16=True,
    report_to=["tensorboard"],
    learning_rate=2e-5,
    warmup_steps=100
)
train_args

TrainingArguments(
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=True,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
enable_jit_checkpoint=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.NO,
eval_use_gather_object=False,

In [9]:
from transformers import Trainer, DataCollatorForLanguageModeling

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_datasets["train"].select(range(int(len(tokenized_datasets["train"]) / 5))),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [10]:
# trainer.train()

In [11]:
from transformers import pipeline

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [12]:
pipe("西安交通大学博物馆（Xi'an Jiaotong University Museum）是一座位于西安", max_length=128, do_sample=True)

Passing `generation_config` together with generation-related arguments=({'do_sample', 'max_length'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.
Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'generated_text': "西安交通大学博物馆（Xi'an Jiaotong University Museum）是一座位于西安市大唐芙蓉园内，陈列着唐宋时期的名家书画、唐代书法、唐代墓志铭和唐诗文等各种艺术珍品。西安交通大学博物馆为唐诗文类主题博物馆，展品包括唐诗、唐书、唐诗文、唐帖、唐碑、唐画、唐乐府、唐诗文集、唐诗文集整理研究等，其中《唐诗文选》、《唐诗集解》两本被文化部、中国文联授予“国家级非物质文化遗产”称号。西安交通大学博物馆有唐诗文展览厅、唐诗书论展览厅、唐诗文集展览厅、唐诗文整理研究展览厅、唐诗文书法展览厅、唐诗文绘画展览厅、唐诗文摄影展览厅、唐诗文文献资料陈列厅等。西安交通大学博物馆分三个展厅，分别是唐诗馆、唐书馆、唐诗文馆。唐诗馆为唐诗研究、鉴赏、鉴赏、鉴赏、鉴赏等各个领域的专题展览，包括唐诗作品展、唐诗鉴赏展、唐诗鉴赏研究展、唐诗文鉴赏展等。西安书论馆是书"}]

In [14]:
pipe("第一把手枪局输了的情况下第二把应该", max_length=128, do_sample=True)

Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'generated_text': '第一把手枪局输了的情况下第二把应该让林小凤来打吧，林小凤在前面打起来，不过打起来没多大效果，林小凤就停了下来，然后把枪递给了他，他一看，就明白了，林小凤以前打过人，知道这把枪要杀人的，就拿过来，然后又递给了他，他说，林小凤，你快过来。林小凤马上就过来，就准备拿枪来打他，不过没想到林小凤的枪已经打进了他的体内，而且还是被林小凤的枪给打中，这个林小凤就只能死了。虽然林小凤的身体被击中了，但是林小凤还活着，他的腿也没事，但是他还是被林小凤的枪给打中了，林小凤就只能死了，不过林小凤的腿没事，所以他还是活了下来。不过林小凤还是被打了，他的腿又被打伤了，但是他还是坚持活着，不过这时候，他发现了一个小男孩，他叫小明，他就是林小凤的哥哥，小明就把他叫到自己的床下。小明看到林小凤，就上去把他抱了起来，然后小明就对林小凤'}]