In [1]:
from transformers import AutoTokenizer

# Load the pretrained GPT-2 tokenizer.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path='gpt2')

# Optional: register a pad token (left disabled here).
#tokenizer.add_special_tokens({'pad_token': '<|endoftext|>'})

print(tokenizer)

# Trial encoding of two sample sentences. No padding is requested, so the two
# rows come back with different lengths.
tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=[
        'hide new secretions from the parental units',
        'contains no wit , only labored gags',
    ])

PreTrainedTokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'})


{'input_ids': [[24717, 649, 3200, 507, 422, 262, 21694, 4991], [3642, 1299, 645, 20868, 837, 691, 2248, 1850, 308, 3775]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [2]:
from datasets import load_dataset, load_from_disk, concatenate_datasets

# Load the IMDB dataset from a local copy. The two commented lines show how
# that copy was originally downloaded and saved.
#dataset = load_dataset('imdb')
#dataset.save_to_disk('datas/imdb')
dataset = load_from_disk('datas/imdb')

# Merge the three original splits, then carve out a fresh 99% / 1% split.
dataset = concatenate_datasets(
    [dataset[split] for split in ('train', 'test', 'unsupervised')])
dataset = dataset.train_test_split(seed=0, test_size=0.01)

# Subsample: the full dataset is too large to train on here.
dataset['train'] = dataset['train'].shuffle(seed=0).select(range(80000))
dataset['test'] = dataset['test'].shuffle(seed=0).select(range(200))


# Tokenization
def f(data):
    """Strip doubled HTML line breaks from a batch of texts and tokenize it.

    Args:
        data: batch mapping whose 'text' entry is a list of strings.

    Returns:
        The result of `tokenizer.batch_encode_plus` on the cleaned strings.
    """
    # Build a cleaned copy instead of writing back into the caller's batch.
    # Only the doubled tag '<br /><br />' is replaced (by a single space).
    texts = [text.replace('<br /><br />', ' ') for text in data['text']]

    return tokenizer.batch_encode_plus(texts)


# Tokenize every split in batches of 1000 using 4 worker processes, and drop
# the raw 'text' and 'label' columns from the result.
dataset = dataset.map(
    function=f,
    remove_columns=['text', 'label'],
    batched=True,
    batch_size=1000,
    num_proc=4,
)


# Drop sequences that are too short
def f(data):
    """Return one keep/drop flag per example in the batch.

    An example is kept when its attention mask sums to at least 25.
    """
    keep = []
    for mask in data['attention_mask']:
        keep.append(sum(mask) >= 25)
    return keep


# Apply the length filter batch-wise using 4 worker processes.
dataset = dataset.filter(
    function=f,
    batched=True,
    batch_size=1000,
    num_proc=4,
)


# Pack the token sequences into blocks of one uniform length
def f(data, block_size=512):
    """Concatenate a batch of token sequences and cut it into equal blocks.

    Args:
        data: batch mapping whose 'input_ids' entry is a list of token-id
            lists.
        block_size: number of tokens per output block (default 512).

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels'. Each value is a
        list of lists of length `block_size`. Trailing tokens that do not fill
        a whole block are discarded. 'labels' holds independent copies of the
        'input_ids' rows.

    Raises:
        ValueError: if `block_size` is not positive.
    """
    if block_size <= 0:
        raise ValueError('block_size must be a positive integer')

    # Flatten the batch into a single token stream.
    stream = [token for sequence in data['input_ids'] for token in sequence]

    # Cut the stream into full blocks; the incomplete tail is dropped.
    usable = len(stream) - len(stream) % block_size
    blocks = [
        stream[start:start + block_size]
        for start in range(0, usable, block_size)
    ]

    return {
        'input_ids': blocks,
        # The blocks contain no padding, so every position is attended to.
        'attention_mask': [[1] * block_size for _ in blocks],
        # Copy each row so that editing labels never touches input_ids.
        'labels': [list(block) for block in blocks],
    }


# Pack every split into fixed-length blocks. Each batch of 1000 sequences is
# packed on its own, using 4 worker processes.
dataset = dataset.map(function=f, num_proc=4, batch_size=1000, batched=True)

# Show the resulting splits and their row counts.
dataset

Loading cached split indices for dataset at datas/imdb/train/cache-1d535447c1465b6c.arrow and datas/imdb/train/cache-e90a494c3d555261.arrow
Loading cached shuffled indices for dataset at datas/imdb/train/cache-d0103816d7176d46.arrow
Loading cached shuffled indices for dataset at datas/imdb/train/cache-eb5685c86456ecec.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-2a96f14a7578e489.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-067837a2d816bf82.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-af81cf006a6a947e.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-296d547c05833599.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-d8c36e759992d45e.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-0901fa7715bba813.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-0366ef2f735087b2.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-c9f0dfc46457adf9.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-acd1bb5d6ba45c71_00000_of_00004.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-acd1bb5d6ba45c71_00001_of_00004.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-acd1bb5d6ba45c71_00002_of_00004.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-acd1bb5d6ba45c71_00003_of_00004.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-dc8b9e8601cc8455_00000_of_00004.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-dc8b9e8601cc8455_00001_of_00004.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-dc8b9e8601cc8455_00002_of_00004.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-dc8b9e8601cc8455_00003_of_00004.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-9906f3cd542c0fa2.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-841d891b93b4bc07.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-618be455982094aa.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-c521da6316862744.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-ed91b24e3add7132.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-cc6fbee431f24db2.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-5db45ac2efda8395.arrow


 

Loading cached processed dataset at datas/imdb/train/cache-6e280417b8978311.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 44863
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 107
    })
})

In [3]:
import torch
from transformers.data.data_collator import default_data_collator

# Data loader over the training split
loader = torch.utils.data.DataLoader(
    dataset['train'],
    batch_size=8,
    shuffle=True,
    drop_last=True,
    collate_fn=default_data_collator,
)

# Pull a single batch for inspection; `i` ends up as 0, exactly as with an
# enumerate loop that breaks on its first iteration.
i, data = next(enumerate(loader))

len(loader), data

(5607,
 {'input_ids': tensor([[   11,   351,   393,  ..., 28177,   477,   286],
          [  845,   880,  5600,  ...,    11,   339, 38383],
          [ 5967,   326,   617,  ...,  4930,  2753,   257],
          ...,
          [ 3621,   284,   766,  ...,  8876,  8165,   284],
          [  318,  6994,   337,  ...,  1577,   329,   777],
          [  319,  8829,   810,  ...,   632,  5788,   617]]),
  'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          ...,
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1]]),
  'labels': tensor([[   11,   351,   393,  ..., 28177,   477,   286],
          [  845,   880,  5600,  ...,    11,   339, 38383],
          [ 5967,   326,   617,  ...,  4930,  2753,   257],
          ...,
          [ 3621,   284,   766,  ...,  8876,  8165,   284],
          [  318,  6994,   337,  ...,  1577,   329,   777],
          [  319,  8829,   

In [4]:
from transformers import AutoModelForCausalLM, GPT2Model

# Load the model
#model = AutoModelForCausalLM.from_pretrained('gpt2')


# Downstream task model
class Model(torch.nn.Module):
    """GPT-2 backbone followed by a separate linear language-model head.

    The head is its own `torch.nn.Linear` layer (no bias) whose weights are
    copied from the pretrained causal-LM head at construction time.
    """

    def __init__(self):
        super().__init__()
        self.pretrained = GPT2Model.from_pretrained('gpt2')

        # Size the head from the backbone's own config rather than from a
        # hard-coded width and the notebook-global tokenizer.
        config = self.pretrained.config
        self.fc = torch.nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Initialise the head with the pretrained lm_head weights.
        parameters = AutoModelForCausalLM.from_pretrained('gpt2')
        self.fc.load_state_dict(parameters.lm_head.state_dict())

        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the backbone and head, and optionally the next-token loss.

        Args:
            input_ids: token ids, indexed as [batch, sequence].
            attention_mask: mask passed through to the backbone.
            labels: optional target ids with the same layout as `input_ids`.
                When omitted, no loss is computed.

        Returns:
            dict with 'loss' (a scalar tensor, or None when `labels` is None)
            and 'logits' (one vocabulary-sized score row per position).
        """
        hidden = self.pretrained(input_ids=input_ids,
                                 attention_mask=attention_mask)
        hidden = hidden.last_hidden_state

        logits = self.fc(hidden)

        loss = None
        if labels is not None:
            # Position t predicts token t + 1: drop the last logit row and the
            # first label, then flatten batch and sequence together.
            shift_logits = logits[:, :-1].flatten(end_dim=1)
            shift_labels = labels[:, 1:].flatten()
            loss = self.criterion(shift_logits, shift_labels)

        return {
            'loss': loss,
            'logits': logits,
        }


model = Model()

# Parameter count, printed in units of 10,000.
print(sum(p.numel() for p in model.parameters()) / 10000)

# Sanity-check forward pass on the sample batch, without tracking gradients.
with torch.no_grad():
    out = model(input_ids=data['input_ids'],
                attention_mask=data['attention_mask'],
                labels=data['labels'])

out['loss'], out['logits'].shape

16303.7184


(tensor(3.8249), torch.Size([8, 512, 50257]))

In [5]:
def generate(text, num_samples=5, max_length=30, top_k=50):
    """Sample continuations of `text` with top-k sampling and print them.

    Uses the notebook-global `model` and `tokenizer`.

    Args:
        text: prompt string.
        num_samples: number of continuations to draw (default 5).
        max_length: total sequence length in tokens, prompt included, at which
            generation stops (default 30). A prompt that is already this long
            is printed unchanged.
        top_k: number of highest-scoring tokens kept as candidates at each
            step (default 50).

    Raises:
        ValueError: if `num_samples` or `top_k` is smaller than 1.
    """
    if num_samples < 1:
        raise ValueError('num_samples must be at least 1')
    if top_k < 1:
        raise ValueError('top_k must be at least 1')

    # Sampling must run with dropout switched off. The model may still be in
    # training mode, because train() turns it on.
    model.eval()
    device = next(model.parameters()).device

    # The same prompt repeated once per sample: [num_samples, prompt_len]
    encoded = tokenizer.batch_encode_plus([text] * num_samples,
                                          return_tensors='pt')
    input_ids = encoded['input_ids'].to(device)

    while input_ids.shape[1] < max_length:
        with torch.no_grad():
            # Labels are passed because forward() may require them; the loss
            # it returns is not used here.
            out = model(input_ids=input_ids,
                        attention_mask=torch.ones_like(input_ids),
                        labels=input_ids)

        # Scores for the last position only: [num_samples, vocab]
        logits = out['logits'][:, -1]

        # The k-th largest score per row is the cut-off; everything below it
        # is set to -inf so it gets zero probability. [num_samples, 1]
        k = min(top_k, logits.shape[-1])
        threshold = torch.topk(logits, k).values[:, -1].unsqueeze(dim=1)
        logits = logits.masked_fill(logits < threshold, -float('inf'))

        # Draw one token per row from the renormalised distribution.
        # [num_samples, vocab] -> [num_samples, 1]
        next_token = logits.softmax(dim=1).multinomial(num_samples=1)

        input_ids = torch.cat([input_ids, next_token], dim=1)

    for i in range(num_samples):
        print(i, tokenizer.decode(input_ids[i]))


# Sample continuations from the model before any fine-tuning.
generate(text='I love this')

0 I love this guy and I'm really into him for sure."

He was not available for comment.<|endoftext|>The US Department of Justice has issued
1 I love this book! I will definitely be reading more!<|endoftext|>Here are links to video games you may have heard about in the past week or so
2 I love this comic so much. Every action is hilarious and fun and a great way of showing off your awesome work. This may change over time and
3 I love this game and can't wait to try this out."

With these two names on the market next month, it makes sense to see
4 I love this book and I can finally read every page of it on my Kindle!"<|endoftext|>This item is no longer available. You will be receiving this


In [6]:
from transformers import AdamW
from transformers.optimization import get_scheduler


# Training
def train():
    """Fine-tune the global `model` for one pass over `loader` and save it.

    Uses AdamW (lr 2e-5) with a linear schedule and no warm-up, and clips the
    gradient norm to 1.0. Every 50 steps it prints the step index, the loss,
    the current learning rate and the next-token accuracy on that batch.
    Afterwards the model is moved to the CPU, switched to evaluation mode and
    saved whole to 'models/13.生成.model'.
    """
    global model
    import os  # local import: only needed to create the output directory

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)

    optimizer = AdamW(model.parameters(), lr=2e-5)
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,
                              num_training_steps=len(loader),
                              optimizer=optimizer)

    model.train()
    for i, data in enumerate(loader):
        data = {key: value.to(device) for key, value in data.items()}
        out = model(**data)
        loss = out['loss']

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # The optimizer was built from model.parameters(), so this clears
        # every gradient; a second model.zero_grad() is not needed.
        optimizer.zero_grad()

        if i % 50 == 0:
            # Position t predicts token t + 1, hence the shifted slices.
            labels = data['labels'][:, 1:]
            predictions = out['logits'].argmax(dim=2)[:, :-1]

            accuracy = (labels == predictions).sum().item() / labels.numel()

            # Read after scheduler.step(), so this is the rate for the next step.
            lr = optimizer.param_groups[0]['lr']

            print(i, loss.item(), lr, accuracy)

    # Leave training mode before saving, so that a model restored with
    # torch.load does not run with dropout active.
    model.eval()
    model = model.to('cpu')

    # Create the target directory first; otherwise a missing 'models/' folder
    # would make the save fail after the whole training pass.
    save_path = 'models/13.生成.model'
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    torch.save(model, save_path)


#train()



0 3.95790696144104 1.9996433030140896e-05 0.29696673189823874
50 4.074559688568115 1.981808453718566e-05 0.2967221135029354
100 3.8087832927703857 1.9639736044230427e-05 0.3175146771037182
150 3.982102155685425 1.9461387551275193e-05 0.30968688845401177
200 3.8035480976104736 1.9283039058319958e-05 0.32093933463796476
250 3.858929395675659 1.9104690565364724e-05 0.31237769080234834
300 3.8749477863311768 1.892634207240949e-05 0.300880626223092
350 3.9110424518585205 1.8747993579454255e-05 0.299412915851272
400 3.6504409313201904 1.856964508649902e-05 0.336839530332681
450 3.7060563564300537 1.8391296593543786e-05 0.3182485322896282
500 3.8308796882629395 1.821294810058855e-05 0.31898238747553814
550 3.9164681434631348 1.8034599607633317e-05 0.30552837573385516
600 3.844902992248535 1.7856251114678083e-05 0.3148238747553816
650 3.864423990249634 1.767790262172285e-05 0.31947162426614484
700 3.9106669425964355 1.7499554128767614e-05 0.3084637964774951
750 3.906909227371216 1.732120563581

In [7]:
# Reload the fine-tuned model saved by train() and sample from it again.
# torch.load unpickles the whole object, so only load files you trust.
# NOTE(review): recent PyTorch releases reportedly default to
# weights_only=True, which would reject a fully pickled module -- confirm
# against the installed version.
model = torch.load(f='models/13.生成.model')
generate(text='I love this')

0 I love this movie, I love it. It's about a guy in a small town, trying to be a good father(...I mean,
1 I love this film; I even enjoyed it in the 80's! However, I can't get too much into it.This is truly a great
2 I love this movie. Some movies with bad actors and actresses would have more of a reason but The Dictator is one of the best documentaries I
3 I love this is one of them!! I even wrote my Oscar for this film when it was released in 1988. Great documentary to watch, especially when
4 I love this movie but I think it is rubbish. There are only ten people worth watching who are not good but these people are as annoying as the
