In [1]:
!pip install datasets



In [2]:
from transformers import GPT2Tokenizer

In [3]:
tokenizer = GPT2Tokenizer.from_pretrained("tokenizer")
tokenizer.add_special_tokens({
    "bos_token":"<s>",
    "pad_token":"<pad>",
    "eos_token":"</s>",
    "unk_token":"<unk>",
    "mask_token":"<mask>",    
})

0

In [4]:
!nvidia-smi

Fri Jan 19 18:03:51 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 536.67                 Driver Version: 536.67       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3070      WDDM  | 00000000:01:00.0  On |                  N/A |
|  0%   54C    P0              44W / 240W |    224MiB /  8192MiB |      1%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [5]:
import torch
torch.cuda.is_available()

True

In [6]:
from transformers import GPT2Config
from transformers import GPT2LMHeadModel

In [7]:
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [8]:
model = GPT2LMHeadModel(config)

In [9]:
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(52000, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

### 构建数据集的两种方式

#### 方法一 较慢

In [10]:
# %%time

# from transformers import LineByLineTextDataset

# dataset = LineByLineTextDataset(
#     tokenizern = tokenizer,
#     file_path = "./sample_data.txt",
#     block_size = 128,
# )

#### 方法二

In [11]:
from datasets import load_dataset

#### 分词、索引

In [12]:
paths = ['sample_data.txt']

dataset = load_dataset('text', data_files=paths)

def encode(batch):
    return tokenizer(batch['text'], 
                     padding=True, 
                     truncation=True, 
                     max_length=512, 
                     add_special_tokens=True)
    
dataset.set_transform(encode)

Generating train split: 0 examples [00:00, ? examples/s]

In [13]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 145142
    })
})

In [14]:
dataset = dataset['train']

### 设置数据采集器

In [15]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [16]:
!pip install transformers==4.28.0 --user



In [17]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./model",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    save_steps=100,
    save_total_limit=2,
    prediction_loss_only=True,
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [18]:
trainer.train()



Step,Training Loss
500,5.4127
1000,4.5446
1500,4.2908
2000,4.1156
2500,3.9091
3000,3.7682
3500,3.6937
4000,3.5601
4500,3.5205
5000,3.418


TrainOutput(global_step=18143, training_loss=3.2308475713262563, metrics={'train_runtime': 4134.3736, 'train_samples_per_second': 35.106, 'train_steps_per_second': 4.388, 'total_flos': 1.0147961954304e+16, 'train_loss': 3.2308475713262563, 'epoch': 1.0})

In [19]:
trainer.save_model('./model_save')