In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForMaskedLM,  AutoTokenizer, AutoModelForCausalLM, BertTokenizer
from transformers import DataCollatorForLanguageModeling,DataCollatorForSeq2Seq
from tqdm import tqdm
import datasets

In [2]:
tokenizer = BertTokenizer.from_pretrained("/pretrains/pt/gpt2-chinese-cluecorpussmall")
model = AutoModelForCausalLM.from_pretrained("/pretrains/pt/gpt2-chinese-cluecorpussmall")

In [3]:
dataset = datasets.load_dataset("text", data_files=["/home/qing/datasets/qidian/我师兄实在太稳健了.txt"])

Using custom data configuration default-eda27db4c7fe4dae


Downloading and preparing dataset text/default to /home/qing/.cache/huggingface/datasets/text/default-eda27db4c7fe4dae/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /home/qing/.cache/huggingface/datasets/text/default-eda27db4c7fe4dae/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 268604
    })
})

In [3]:
dataset = datasets.Dataset.from_list([ {'text':line.strip()} for line in open("/home/qing/datasets/qidian/我师兄实在太稳健了.txt") if line.strip() != "" ])

In [4]:
dataset['text'][0]

'《我师兄实在太稳健了》'

In [5]:
tokenizer = BertTokenizer.from_pretrained("/pretrains/pt/gpt2-chinese-cluecorpussmall")
tokenizer.tokenize(dataset['text'][0])

['《', '我', '师', '兄', '实', '在', '太', '稳', '健', '了', '》']

In [6]:
dataset.features

{'text': Value(dtype='string', id=None)}

In [7]:
def tokenization_seq2seq(sample):
    td = tokenizer(sample['text'])
    return td

p_dataset = dataset.map(tokenization_seq2seq, batched=True)
p_dataset

  0%|          | 0/135 [00:00<?, ?ba/s]

Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 134307
})

In [8]:
from copy import deepcopy
new_dataset = deepcopy(p_dataset)
new_dataset.set_format(type='torch', columns=["input_ids", "attention_mask"])

In [9]:
new_dataset[:2]

{'input_ids': [tensor([ 101,  517, 2769, 2360, 1040, 2141, 1762, 1922, 4937,  978,  749,  518,
           102]),
  tensor([ 101,  868, 5442, 8038, 6241, 2495, 3633,  837,  102])],
 'attention_mask': [tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
  tensor([1, 1, 1, 1, 1, 1, 1, 1, 1])]}

In [16]:
from q_snippets.data import sequence_padding
class MyCollator:

    def __call__(self, features, return_tensors=None):

        input_ids = torch.tensor(sequence_padding([x['input_ids'].numpy() for x in features], length=None,force=True)).long()
        attention_mask = torch.tensor(sequence_padding([x['attention_mask'].numpy() for x in features], length=None,force=True)).long()
        # token_type_ids = torch.tensor(sequence_padding([x.token_type_ids for x in batch], length=self.max_len,force=True)).long()
        labels = deepcopy(input_ids)
        # decoder_input_ids = input_ids[:,:-1]

        return {
            'input_ids' : input_ids, 'attention_mask' : attention_mask,
            "labels":labels, 
            # 'decoder_input_ids':decoder_input_ids
        }
        
print(new_dataset[:2])
dataloader = DataLoader(new_dataset, collate_fn=MyCollator(), batch_size=4)
for i, batch in enumerate(tqdm(dataloader, total=5)):
    print([(k, v.size()) for k,v in  batch.items() ])
    if i == 5:
        break

{'input_ids': [tensor([ 101,  517, 2769, 2360, 1040, 2141, 1762, 1922, 4937,  978,  749,  518,
         102]), tensor([ 101,  868, 5442, 8038, 6241, 2495, 3633,  837,  102])], 'attention_mask': [tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1])]}


100%|██████████| 5/5 [00:00<00:00, 438.11it/s]

[('input_ids', torch.Size([4, 253])), ('attention_mask', torch.Size([4, 253])), ('labels', torch.Size([4, 253]))]
[('input_ids', torch.Size([4, 48])), ('attention_mask', torch.Size([4, 48])), ('labels', torch.Size([4, 48]))]
[('input_ids', torch.Size([4, 57])), ('attention_mask', torch.Size([4, 57])), ('labels', torch.Size([4, 57]))]
[('input_ids', torch.Size([4, 52])), ('attention_mask', torch.Size([4, 52])), ('labels', torch.Size([4, 52]))]
[('input_ids', torch.Size([4, 44])), ('attention_mask', torch.Size([4, 44])), ('labels', torch.Size([4, 44]))]
[('input_ids', torch.Size([4, 19])), ('attention_mask', torch.Size([4, 19])), ('labels', torch.Size([4, 19]))]





In [11]:

device = 'cuda' if torch.cuda.is_available() else 'cpu' 
model.train().to(device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)

In [18]:
for epoch in range(3):
    for i, batch in enumerate(tqdm(dataloader, total=5)):
        if i == 20:
            break
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if i % 10 == 0:
            print(f"loss: {loss}")

 80%|████████  | 4/5 [00:00<00:00, 15.71it/s]

loss: 5.062230110168457


14it [00:00, 17.39it/s]                      

loss: 2.7836952209472656


20it [00:01, 17.03it/s]
 80%|████████  | 4/5 [00:00<00:00, 17.22it/s]

loss: 1.2250216007232666


14it [00:00, 17.65it/s]                      

loss: 2.022953748703003


20it [00:01, 17.55it/s]
 80%|████████  | 4/5 [00:00<00:00, 17.18it/s]

loss: 1.0827327966690063


14it [00:00, 17.69it/s]                      

loss: 1.8943085670471191


20it [00:01, 17.55it/s]
