In [None]:
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, GPT2LMHeadModel, pipeline, \
                         Trainer, TrainingArguments

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')  # load the tokenizer that matches the standard gpt2 checkpoint

tokenizer.pad_token = tokenizer.eos_token
# Reuse the end-of-sequence token as the pad token so that shorter sequences
# in a batch can be filled up to a common length.

In [None]:
# Load the raw text file into a dataset of fixed-length token blocks.
# NOTE(review): the inline comment names "Principles of Data Science", but the
# decoded first block recorded in this notebook starts with "MEGATECH" —
# confirm which book /content/Book.txt actually contains.
pds_data = TextDataset(
    tokenizer=tokenizer,
    file_path='/content/Book.txt',  # Principles of Data Science - Sinan Ozdemir
    block_size=64,  # number of tokens in each chunk of text used as a datapoint
)



In [None]:
# Fetch the first example once, then display its token ids together with its shape.
first_block = pds_data[0]
first_block, first_block.shape

(tensor([   44,  7156,  1404, 25994,   198,    44,  7156,  1404, 25994,   198,
            51, 25994,    45, 43781,  3268,   362,    46,    20,    46,   198,
            51, 25994,    45, 43781,  3268, 32215,   198, 42131,   416,   198,
            35, 43664,  3698,  8782, 15154, 34509, 42131,   416,   198,    35,
         43664,  3698,  8782, 15154, 34509,   198, 30650,   198,   198, 24492,
           739,  8568, 17098,   422,   383, 36619,   416,   198, 37046, 13661,
         12052,   198,    18,  6479]),
 torch.Size([64]))

In [None]:
# Turn the first block's token ids back into text to sanity-check the loaded data.
first_block_text = tokenizer.decode(pds_data[0])
print(first_block_text)

MEGATECH
MEGATECH
TECHNOLOGY IN 2O5O
TECHNOLOGY IN 2050
edited by
DANIEL FRANKLINedited by
DANIEL FRANKLIN
Books

Published under exclusive licence from The Economist by
Profile Books Ltd
3 Hol


In [None]:
# Collator that turns a list of tokenized examples into a padded training batch.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
    # MLM is Masked Language Modelling (for BERT + auto-encoding tasks);
    # it is switched off here because GPT-2 is trained to predict the next token.
)

In [None]:
# Demonstrate dynamic padding: tokenize two inputs of different lengths and
# let the collator pad them into a single rectangular batch.
example_encodings = [tokenizer(text) for text in ('I am an input', 'So am I')]
collator_example = data_collator(example_encodings)

collator_example

{'input_ids': tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]]), 'labels': tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])}

In [None]:
collator_example.input_ids  # the shorter input is padded at the end with 50256, our pad token id

tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50256]])

In [None]:
# Confirm the id of the pad token (the eos token was assigned as the pad token earlier).
tokenizer.pad_token_id

50256

In [None]:
collator_example.attention_mask  # Note the 0 in the attention mask where we have a pad token; 1 marks real tokens

tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]])

In [None]:
model = GPT2LMHeadModel.from_pretrained('gpt2')  # load the pretrained GPT-2 language model

# Build a text generator around the pretrained model.
# The sampling settings are passed as keyword arguments so the pipeline forwards
# them to generation. Wrapping them in a dict given to `config` does not work:
# that argument expects a model configuration, not generation options (the
# recorded samples stopping far short of 200 tokens are consistent with that).
# The notebook's own tokenizer object is reused, as the fine-tuned generator
# does, so both generators share the same pad-token setup.
pretrained_generator = pipeline(
    'text-generation', model=model, tokenizer=tokenizer,
    max_length=200, do_sample=True, top_p=0.9, temperature=0.7, top_k=10
)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# Sample three continuations of the prompt from the pretrained (not yet
# fine-tuned) model and print them separated by divider lines.
divider = '----------'
print(divider)
for sample in pretrained_generator('what is the text about', num_return_sequences=3):
    print(sample['generated_text'])
    print(divider)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


----------
what is the text about?

I am sorry, sir. Is this your first question?
If he did not have anything to do with me or with the world -
could you explain the whole process of creation?
Would it make
----------
what is the text about the problem?
In this essay the author argues that:
The economic crisis in the world today has become part of some kind of world of globalization, to which neoliberalism can relate (from the perspective of human capital).
----------
what is the text about the book?)
This is what is called a literary analysis – and it involves a few key assumptions.
You don't know what you're getting at once, because nothing is happening for thousands of pages
A new book
----------


In [None]:
# Hold out the last 20% of the token blocks for evaluation. The split is a
# plain positional slice (no shuffling), so it is identical on every run.
split_index = int(len(pds_data.examples) * .8)

training_args = TrainingArguments(
    output_dir="./gpt2_pds",  # the output directory for checkpoints and the saved model
    overwrite_output_dir=True,  # overwrite the content of the output directory
    num_train_epochs=3,  # number of training epochs
    per_device_train_batch_size=32,  # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    logging_steps=10,  # log the training loss every 10 optimizer steps
    load_best_model_at_end=True,  # reload the best checkpoint once training finishes
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`
    # argument. Evaluation and checkpointing both run once per epoch, which
    # `load_best_model_at_end` needs in order to compare checkpoints.
    eval_strategy='epoch',
    save_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=pds_data.examples[:split_index],  # first 80% of the blocks
    eval_dataset=pds_data.examples[split_index:]  # remaining 20% of the blocks
)

# Baseline: evaluation loss of the pretrained model before any fine-tuning.
trainer.evaluate()



{'eval_loss': 3.7811810970306396,
 'eval_model_preparation_time': 0.0082,
 'eval_runtime': 0.1849,
 'eval_samples_per_second': 135.214,
 'eval_steps_per_second': 5.409}

In [None]:
# Fine-tune the model; the returned TrainOutput (step count, loss, metrics) is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time
1,No log,3.708098,0.0082
2,No log,3.672349,0.0082
3,3.595000,3.668317,0.0082


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=12, training_loss=3.5815111796061196, metrics={'train_runtime': 40.4744, 'train_samples_per_second': 7.19, 'train_steps_per_second': 0.296, 'total_flos': 9504497664000.0, 'train_loss': 3.5815111796061196, 'epoch': 3.0})

In [None]:
# Evaluate again on the held-out split; compare eval_loss with the pre-training baseline.
trainer.evaluate()

{'eval_loss': 3.6683168411254883,
 'eval_model_preparation_time': 0.0082,
 'eval_runtime': 0.1712,
 'eval_samples_per_second': 146.01,
 'eval_steps_per_second': 5.84,
 'epoch': 3.0}

In [None]:
# Persist the fine-tuned model. No path is given here; the next cell loads it
# back from './gpt2_pds', the output_dir configured for the Trainer.
trainer.save_model()

In [None]:
# Reload the fine-tuned weights from the directory the Trainer was configured to write to.
loaded_model = GPT2LMHeadModel.from_pretrained('./gpt2_pds')

# Build a text generator around the fine-tuned model.
# The sampling settings are passed as keyword arguments so the pipeline forwards
# them to generation. Wrapping them in a dict given to `config` does not work:
# that argument expects a model configuration, not generation options.
finetuned_generator = pipeline(
    'text-generation', model=loaded_model, tokenizer=tokenizer,
    max_length=200, do_sample=True, top_p=0.9, temperature=0.7, top_k=10
)

Device set to use cuda:0


In [None]:
# Sample three continuations of the same prompt from the fine-tuned model so
# they can be compared against the pretrained model's samples.
divider = '----------'
print(divider)
for sample in finetuned_generator('what is the text about', num_return_sequences=3):
    print(sample['generated_text'])
    print(divider)

----------
what is the text about what is required here):

(5) The author, who is well versed in the content and history of the text, may well be able to relate it to the context of the present moment.
(6
----------
what is the text about it?' she says. 'And what does it say about the future?'
When I speak to the author of A Tale of Two Cities, a biography of the late John Maynard Keynes, she is quick to acknowledge the
----------
what is the text about it, then what's the relevance?


It is fascinating to see how much the language of science has changed in recent years; not less than 150 years. There is strong evidence it will continue to change in the coming
----------
