# GPT for style completion

In [1]:
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, GPT2LMHeadModel, pipeline, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [3]:
pds_data = TextDataset(
    tokenizer= tokenizer,
    file_path="../../data/PDS2.txt",
    block_size=32 # this is the length of each chunk of text to use as a data point
)



In [4]:
pds_data[0] , pds_data[0].shape # inspecting the first entry

(tensor([  200, 47231,  6418,   286,  6060,  5800,   198, 12211,  5061,   198,
           198,    32, 31516,   338,  5698,   284, 13905,  7605,   290,  4583,
           284,   198, 11249,   304,   171,   105,   222, 13967,  1366,    12,
         15808,  5479]),
 torch.Size([32]))

In [5]:
print(tokenizer.decode(pds_data[0]))

Principles of Data Science
Second Edition

A beginner's guide to statistical techniques and theory to
build eﬀective data-driven applications


In [7]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm = False # masked language modelling task.
)

In [10]:
tokenizer.pad_token = tokenizer.eos_token

collator_example = data_collator([tokenizer('I am an input'), tokenizer('So am I')])

collator_example

{'input_ids': tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]]), 'labels': tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])}

In [11]:
collator_example.input_ids

tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50256]])

In [13]:
tokenizer.pad_token_id

50256

In [14]:
collator_example.attention_mask

tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]])

attention mask is 0 where there is pad token

In [15]:
collator_example.labels

tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])

In [16]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

pretrained_generator = pipeline(
    'text-generation', 
    model=model,
    tokenizer='gpt2',
    config={'max_length':200,
            'do_sample': True,
            'top_p':0.9,
            'temperature': 0.7,
            'top_k': 10}
)

Device set to use mps:0


In [17]:
for generated_sequence in pretrained_generator('A dataset shows the relationships', num_return_sequences=3):
    print(generated_sequence['generated_text'])
    print('----------')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


A dataset shows the relationships between all the components of a class in different time-series (X = n,Y = t) with some degree of agreement for the given time series in all of the X and Y component classes. The best-fit
----------
A dataset shows the relationships between the various variables from the previous analysis. This was taken as a random assignment (see Supplemental Table S1 [26]), and the correlation coefficients for the variables from the previous analyses were not significantly different than for the variables that
----------
A dataset shows the relationships between obesity, diabetes, and the risk of both coronary heart disease and cancer among US adults over the ages of 36 years. Women are also four times more likely than men in terms of high body mass index (BMI)
----------


In [24]:
# Initialize training arguments
training_args = TrainingArguments(
    output_dir="./gpt2_pds", # The output directory
    overwrite_output_dir=True, # Overwrite the content of the output directory
    num_train_epochs=3, # Number of training epochs
    per_device_train_batch_size=32, # Batch size for training
    per_device_eval_batch_size=32,  # Batch size for evaluation
    warmup_steps=len(pds_data.examples) // 5, # Number of warmup steps for learning rate scheduler
    logging_steps=50,
    load_best_model_at_end=True,
    evaluation_strategy='epoch',
    save_strategy='epoch',       # Save checkpoint at the end of each epoch
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=pds_data.examples[:int(len(pds_data.examples) * 0.8)],
    eval_dataset=pds_data.examples[int(len(pds_data.examples) * 0.8):],
)

# Start evaluation
trainer.evaluate()


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'