# Project Part 3

In [1]:
# imports

import pandas as pd
import torch 
from torch.utils.data import Dataset, random_split
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset

In [46]:
# load dataset
dataset = load_dataset("biglam/gutenberg-poetry-corpus", split="train")

Using custom data configuration cakiki--gutenberg-poetry-corpus-7745b6aecdad34dc
Found cached dataset parquet (C:/Users/Shayne Kaiser/.cache/huggingface/datasets/biglam___parquet/cakiki--gutenberg-poetry-corpus-7745b6aecdad34dc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [47]:
dataset

Dataset({
    features: ['line', 'gutenberg_id'],
    num_rows: 3085117
})

In [48]:
# convert to Pandas
df = pd.DataFrame(dataset)

In [41]:
print('df shape:', df.shape)

df shape: (3085117, 2)


In [16]:
lines = df["line"]
lines.head()

0    The Song of Hiawatha is based on the legends a...
1    many North American Indian tribes, but especia...
2    Ojibway Indians of northern Michigan, Wisconsi...
3    They were collected by Henry Rowe Schoolcraft,...
4    Schoolcraft married Jane, O-bah-bahm-wawa-ge-z...
Name: line, dtype: object

In [5]:
class PoetryDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [5]:
# function for generating text 
def generate_from_model(model, tokenizer, prompt, max_length=300, temp=1.5, num_outputs=10):
    """
    Tokenize the given prompt, must be one string, and generate output from a  provided model and tokenizer.

    Args:
        mondel (transformers.model): The model being used to generate text
        tokenizer: the tokenizer being used
        prompt (str): The input string that is used to generate text
        max_length (int): Max character length of the generated outputs
        temp (int): Set the temperature for the outputs
        num_outputs (int): number of different outputs to be created
        
    """
    print("Outputs for: " + prompt)

    generated = tokenizer("<|startoftext|> " + prompt, return_tensors="pt").input_ids
    
    sample_outputs = model.generate(generated, max_length=max_length, do_sample=True, top_p=0.95, top_k=50, temperature=temp, num_return_sequences=num_outputs)

    for i, sample_output in enumerate(sample_outputs):
        print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

## DistilGPT2

In [19]:
# load in model
torch.manual_seed(92)

MODEL_NAME = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).cuda()
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 768)

In [20]:
training_args = TrainingArguments(output_dir='./Models/DistilGPT2', num_train_epochs=1, logging_steps=10000, save_steps=50000,
                                  per_device_train_batch_size=10, per_device_eval_batch_size=10, warmup_steps=10,
                                   weight_decay=0.05, logging_dir='./Models/DistilGPT2/logs', report_to='none' )

In [21]:
max_length = max([len(tokenizer.encode(line)) for line in lines])

In [22]:
max_length

85

In [23]:
tokenization_dataset = PoetryDataset(lines, tokenizer, max_length=max_length)
train_size = int(0.9  * len(tokenization_dataset))
train_dataset, val_dataset = random_split(tokenization_dataset, [train_size, len(tokenization_dataset) - train_size])

In [24]:
train_dataset[0]

(tensor([50257,   818,   262,  7032,   262,   302,   521, 28153,  4836,   606,
         11496, 50256, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [25]:
print(f"Length of Training dataset: {len(train_dataset)}")
print(f"Length of Validation dataset: {len(val_dataset)}")

Length of Training dataset: 2776605
Length of Validation dataset: 308512


### Freezing layers

In [26]:
# check layers in the model
for name, param in model.named_parameters():
    print(name, param.requires_grad)

transformer.wte.weight True
transformer.wpe.weight True
transformer.h.0.ln_1.weight True
transformer.h.0.ln_1.bias True
transformer.h.0.attn.c_attn.weight True
transformer.h.0.attn.c_attn.bias True
transformer.h.0.attn.c_proj.weight True
transformer.h.0.attn.c_proj.bias True
transformer.h.0.ln_2.weight True
transformer.h.0.ln_2.bias True
transformer.h.0.mlp.c_fc.weight True
transformer.h.0.mlp.c_fc.bias True
transformer.h.0.mlp.c_proj.weight True
transformer.h.0.mlp.c_proj.bias True
transformer.h.1.ln_1.weight True
transformer.h.1.ln_1.bias True
transformer.h.1.attn.c_attn.weight True
transformer.h.1.attn.c_attn.bias True
transformer.h.1.attn.c_proj.weight True
transformer.h.1.attn.c_proj.bias True
transformer.h.1.ln_2.weight True
transformer.h.1.ln_2.bias True
transformer.h.1.mlp.c_fc.weight True
transformer.h.1.mlp.c_fc.bias True
transformer.h.1.mlp.c_proj.weight True
transformer.h.1.mlp.c_proj.bias True
transformer.h.2.ln_1.weight True
transformer.h.2.ln_1.bias True
transformer.h.2.

In [27]:
# freeze the first two layers and 4 hidden units
for name, param in model.named_parameters():
    if name.startswith("transformer.wte"):
        param.requires_grad = False
    if name.startswith("transformer.wpe"):
        param.requires_grad = False
    if any(x in name for x in ['.' + str(x) + '.' for x in range(5)]):
        param.requires_grad = False

In [28]:
# now check layers
for name, param in model.named_parameters():
    print(name, param.requires_grad)

transformer.wte.weight False
transformer.wpe.weight False
transformer.h.0.ln_1.weight False
transformer.h.0.ln_1.bias False
transformer.h.0.attn.c_attn.weight False
transformer.h.0.attn.c_attn.bias False
transformer.h.0.attn.c_proj.weight False
transformer.h.0.attn.c_proj.bias False
transformer.h.0.ln_2.weight False
transformer.h.0.ln_2.bias False
transformer.h.0.mlp.c_fc.weight False
transformer.h.0.mlp.c_fc.bias False
transformer.h.0.mlp.c_proj.weight False
transformer.h.0.mlp.c_proj.bias False
transformer.h.1.ln_1.weight False
transformer.h.1.ln_1.bias False
transformer.h.1.attn.c_attn.weight False
transformer.h.1.attn.c_attn.bias False
transformer.h.1.attn.c_proj.weight False
transformer.h.1.attn.c_proj.bias False
transformer.h.1.ln_2.weight False
transformer.h.1.ln_2.bias False
transformer.h.1.mlp.c_fc.weight False
transformer.h.1.mlp.c_fc.bias False
transformer.h.1.mlp.c_proj.weight False
transformer.h.1.mlp.c_proj.bias False
transformer.h.2.ln_1.weight False
transformer.h.2.ln_1

### Train the Model

In [29]:

Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()



***** Running training *****
  Num examples = 2776605
  Num Epochs = 1
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 277661
  Number of trainable parameters = 7089408


  0%|          | 0/277661 [00:00<?, ?it/s]

{'loss': 0.7205, 'learning_rate': 4.8200978926782185e-05, 'epoch': 0.04}
{'loss': 0.6612, 'learning_rate': 4.640015703166925e-05, 'epoch': 0.07}
{'loss': 0.6561, 'learning_rate': 4.459933513655633e-05, 'epoch': 0.11}
{'loss': 0.651, 'learning_rate': 4.27985132414434e-05, 'epoch': 0.14}


Saving model checkpoint to ./Models/DistilGPT2\checkpoint-50000
Configuration saved in ./Models/DistilGPT2\checkpoint-50000\config.json


{'loss': 0.6485, 'learning_rate': 4.099769134633047e-05, 'epoch': 0.18}


Model weights saved in ./Models/DistilGPT2\checkpoint-50000\pytorch_model.bin


{'loss': 0.6451, 'learning_rate': 3.919686945121754e-05, 'epoch': 0.22}
{'loss': 0.6437, 'learning_rate': 3.739604755610461e-05, 'epoch': 0.25}
{'loss': 0.6417, 'learning_rate': 3.559522566099168e-05, 'epoch': 0.29}
{'loss': 0.6399, 'learning_rate': 3.3794403765878745e-05, 'epoch': 0.32}


Saving model checkpoint to ./Models/DistilGPT2\checkpoint-100000


{'loss': 0.6389, 'learning_rate': 3.199358187076582e-05, 'epoch': 0.36}


Configuration saved in ./Models/DistilGPT2\checkpoint-100000\config.json
Model weights saved in ./Models/DistilGPT2\checkpoint-100000\pytorch_model.bin


{'loss': 0.6379, 'learning_rate': 3.019275997565289e-05, 'epoch': 0.4}
{'loss': 0.6369, 'learning_rate': 2.839193808053996e-05, 'epoch': 0.43}
{'loss': 0.6371, 'learning_rate': 2.659111618542703e-05, 'epoch': 0.47}
{'loss': 0.6361, 'learning_rate': 2.47902942903141e-05, 'epoch': 0.5}


Saving model checkpoint to ./Models/DistilGPT2\checkpoint-150000
Configuration saved in ./Models/DistilGPT2\checkpoint-150000\config.json


{'loss': 0.6335, 'learning_rate': 2.298947239520117e-05, 'epoch': 0.54}


Model weights saved in ./Models/DistilGPT2\checkpoint-150000\pytorch_model.bin


{'loss': 0.6334, 'learning_rate': 2.118865050008824e-05, 'epoch': 0.58}
{'loss': 0.6331, 'learning_rate': 1.938782860497531e-05, 'epoch': 0.61}
{'loss': 0.6335, 'learning_rate': 1.7587006709862385e-05, 'epoch': 0.65}
{'loss': 0.6317, 'learning_rate': 1.5786184814749453e-05, 'epoch': 0.68}


Saving model checkpoint to ./Models/DistilGPT2\checkpoint-200000


{'loss': 0.6318, 'learning_rate': 1.3985362919636521e-05, 'epoch': 0.72}


Configuration saved in ./Models/DistilGPT2\checkpoint-200000\config.json
Model weights saved in ./Models/DistilGPT2\checkpoint-200000\pytorch_model.bin


{'loss': 0.632, 'learning_rate': 1.2184541024523593e-05, 'epoch': 0.76}
{'loss': 0.6321, 'learning_rate': 1.0383719129410665e-05, 'epoch': 0.79}
{'loss': 0.6297, 'learning_rate': 8.582897234297733e-06, 'epoch': 0.83}
{'loss': 0.6309, 'learning_rate': 6.782075339184804e-06, 'epoch': 0.86}


Saving model checkpoint to ./Models/DistilGPT2\checkpoint-250000
Configuration saved in ./Models/DistilGPT2\checkpoint-250000\config.json


{'loss': 0.6308, 'learning_rate': 4.981253444071874e-06, 'epoch': 0.9}


Model weights saved in ./Models/DistilGPT2\checkpoint-250000\pytorch_model.bin


{'loss': 0.6302, 'learning_rate': 3.1804315489589452e-06, 'epoch': 0.94}
{'loss': 0.63, 'learning_rate': 1.3796096538460155e-06, 'epoch': 0.97}




Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 11295.2762, 'train_samples_per_second': 245.82, 'train_steps_per_second': 24.582, 'train_loss': 0.6407216623165254, 'epoch': 1.0}


TrainOutput(global_step=277661, training_loss=0.6407216623165254, metrics={'train_runtime': 11295.2762, 'train_samples_per_second': 245.82, 'train_steps_per_second': 24.582, 'train_loss': 0.6407216623165254, 'epoch': 1.0})

In [30]:
# Save model and Tokenizer
model.save_pretrained("./Models/DistilGPT2")

Configuration saved in ./Models/DistilGPT2\config.json
Model weights saved in ./Models/DistilGPT2\pytorch_model.bin


In [31]:
tokenizer.save_pretrained("./Models/DistilGPT2")

tokenizer config file saved in ./Models/DistilGPT2\tokenizer_config.json
Special tokens file saved in ./Models/DistilGPT2\special_tokens_map.json


('./Models/DistilGPT2\\tokenizer_config.json',
 './Models/DistilGPT2\\special_tokens_map.json',
 './Models/DistilGPT2\\vocab.json',
 './Models/DistilGPT2\\merges.txt',
 './Models/DistilGPT2\\added_tokens.json',
 './Models/DistilGPT2\\tokenizer.json')

In [3]:
# load in saved model and tokenizer 
tokenizer = AutoTokenizer.from_pretrained("./Models/DistilGPT2")
model = AutoModelForCausalLM.from_pretrained("./Models/DistilGPT2")

### Generate Outputs

In [33]:
generate_from_model(model, tokenizer, "In view of the fading animals")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: In view of the fading animals
0:  In view of the fading animals
1:  In view of the fading animals which
2:  In view of the fading animals, I will only live for the
3:  In view of the fading animals at its tail,
4:  In view of the fading animals
5:  In view of the fading animals the little flock of young,
6:  In view of the fading animals.
7:  In view of the fading animals
8:  In view of the fading animals who wander and flee.
9:  In view of the fading animals and the lost;


In [34]:
generate_from_model(model, tokenizer, "Shall I compare thee")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: Shall I compare thee
0:  Shall I compare thee in some respect not so
1:  Shall I compare thee, with his art so dark
2:  Shall I compare thee and the other?
3:  Shall I compare thee, and tell his son to look in thee!
4:  Shall I compare thee?
5:  Shall I compare thee to me;
6:  Shall I compare thee with thee
7:  Shall I compare thee; and then shall thou see
8:  Shall I compare thee more thy grace and my good,
9:  Shall I compare thee for my son,


In [35]:
generate_from_model(model, tokenizer, "Deep into that darkness")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: Deep into that darkness
0:  Deep into that darkness beneath his heart!
1:  Deep into that darkness in that bright hour, we knew
2:  Deep into that darkness lay: the night of darkness on us to be
3:  Deep into that darkness; a dream of life
4:  Deep into that darkness,
5:  Deep into that darkness he saw no face. No voice
6:  Deep into that darkness, where the long night
7:  Deep into that darkness his soul in the night falls;
8:  Deep into that darkness lay its dark abyss;
9:  Deep into that darkness they may not lie so long,


In [36]:
long_prompt = 'Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence was unbroken, And the stillness gave no token, And the only word there spoken Was the whispered word, "Lenore!" This I'

generate_from_model(model, tokenizer, long_prompt)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence was unbroken, And the stillness gave no token, And the only word there spoken Was the whispered word, "Lenore!" This I
0:  Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence was unbroken, And the stillness gave no token, And the only word there spoken Was the whispered word, "Lenore!" This I,
1:  Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence was unbroken, And the stillness gave no token, And the only word there spoken Was the whispered word, "Lenore!" This I
2:  Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence w

In [33]:
generate_from_model(model, tokenizer, "I stay")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: I stay
0:  I stay too late to get away
1:  I stay, in my arms the wind,
2:  I stay to do his duty.
3:  I stay awake on nights long,
4:  I stay up for a day alone to have
5:  I stay--no longer, no more?--you don't have much to think:
6:  I stay so old and strong as a child,
7:  I stay true to the man I loved--
8:  I stay for nights, I pray till I see my Father--
9:  I stay but never leave it, it shall always be,


In [68]:
# try and create more than one line at a time

prompt = 'The sunrise'

for i in range(5):
    generated = tokenizer("<|startoftext|> " + prompt, return_tensors="pt").input_ids

    sample_outputs = model.generate(generated, max_length=500, do_sample=True, top_p=0.95, top_k=50, temperature=1.9, num_return_sequences=1, pad_token_id=50256)

    line = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)

    lastTwoWords = line.split()[-2:]

    prompt = " ".join(lastTwoWords)

    if i == 0:
        print(line + '\n')
    
    else:
        print(line.split(' ', 3)[3] + '\n')
    


 The sunrise blaring the skies like a thousand stars;

or more were there who made us, which, while

all those who might have guessed at how

in the day shall happen, from afar are things more

wise? for you of that.



## GPT-Neo

In [9]:
# load in model
torch.manual_seed(92)

MODEL_NAME = "EleutherAI/gpt-neo-1.3B"

tokenizer_neo = AutoTokenizer.from_pretrained(MODEL_NAME, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
model_neo = AutoModelForCausalLM.from_pretrained(MODEL_NAME).cuda()
model_neo.resize_token_embeddings(len(tokenizer_neo))


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 2048)

In [10]:
# Traing arguments
training_args_neo = TrainingArguments(output_dir='./Models/GPT-Neo', num_train_epochs=1, logging_steps=10000, save_steps=500000,
                                  per_device_train_batch_size=10, per_device_eval_batch_size=10, warmup_steps=10,
                                   weight_decay=0.05, logging_dir='./Models/GPT-Neo/logs', report_to='none' )

### Tokenize data

In [11]:
max_length = max([len(tokenizer_neo.encode(line)) for line in lines])

In [12]:
max_length

85

In [13]:
tokenization_dataset = PoetryDataset(lines, tokenizer_neo, max_length=max_length)
train_size = int(0.9  * len(tokenization_dataset))
train_dataset_neo, val_dataset_neo = random_split(tokenization_dataset, [train_size, len(tokenization_dataset) - train_size])

In [14]:
train_dataset_neo[0]

(tensor([50257,  3152,   257,  9480,  2786,   273,    11,   543,    11,   996,
           284,   262,  4151, 50256, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [15]:
print(f"Length of Training dataset: {len(train_dataset_neo)}")
print(f"Length of Validation dataset: {len(val_dataset_neo)}")

Length of Training dataset: 2776605
Length of Validation dataset: 308512


### Freeze layers

In [16]:
for name, param in model_neo.named_parameters():
    print(name, param.requires_grad)

transformer.wte.weight True
transformer.wpe.weight True
transformer.h.0.ln_1.weight True
transformer.h.0.ln_1.bias True
transformer.h.0.attn.attention.k_proj.weight True
transformer.h.0.attn.attention.v_proj.weight True
transformer.h.0.attn.attention.q_proj.weight True
transformer.h.0.attn.attention.out_proj.weight True
transformer.h.0.attn.attention.out_proj.bias True
transformer.h.0.ln_2.weight True
transformer.h.0.ln_2.bias True
transformer.h.0.mlp.c_fc.weight True
transformer.h.0.mlp.c_fc.bias True
transformer.h.0.mlp.c_proj.weight True
transformer.h.0.mlp.c_proj.bias True
transformer.h.1.ln_1.weight True
transformer.h.1.ln_1.bias True
transformer.h.1.attn.attention.k_proj.weight True
transformer.h.1.attn.attention.v_proj.weight True
transformer.h.1.attn.attention.q_proj.weight True
transformer.h.1.attn.attention.out_proj.weight True
transformer.h.1.attn.attention.out_proj.bias True
transformer.h.1.ln_2.weight True
transformer.h.1.ln_2.bias True
transformer.h.1.mlp.c_fc.weight True

In [17]:
# freeze the first two layers and 23 hidden units
for name, param in model_neo.named_parameters():
    if name.startswith("transformer.wte"):
        param.requires_grad = False
    if name.startswith("transformer.wpe"):
        param.requires_grad = False
    if any(x in name for x in ['.' + str(x) + '.' for x in range(23)]):
        param.requires_grad = False

In [18]:
# check layers
for name, param in model_neo.named_parameters():
    print(name, param.requires_grad)

transformer.wte.weight False
transformer.wpe.weight False
transformer.h.0.ln_1.weight False
transformer.h.0.ln_1.bias False
transformer.h.0.attn.attention.k_proj.weight False
transformer.h.0.attn.attention.v_proj.weight False
transformer.h.0.attn.attention.q_proj.weight False
transformer.h.0.attn.attention.out_proj.weight False
transformer.h.0.attn.attention.out_proj.bias False
transformer.h.0.ln_2.weight False
transformer.h.0.ln_2.bias False
transformer.h.0.mlp.c_fc.weight False
transformer.h.0.mlp.c_fc.bias False
transformer.h.0.mlp.c_proj.weight False
transformer.h.0.mlp.c_proj.bias False
transformer.h.1.ln_1.weight False
transformer.h.1.ln_1.bias False
transformer.h.1.attn.attention.k_proj.weight False
transformer.h.1.attn.attention.v_proj.weight False
transformer.h.1.attn.attention.q_proj.weight False
transformer.h.1.attn.attention.out_proj.weight False
transformer.h.1.attn.attention.out_proj.bias False
transformer.h.1.ln_2.weight False
transformer.h.1.ln_2.bias False
transformer.

### Train the model

In [19]:
Trainer(model=model_neo,  args=training_args_neo, train_dataset=train_dataset_neo, 
        eval_dataset=val_dataset_neo, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

***** Running training *****
  Num examples = 2776605
  Num Epochs = 1
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 277661
  Number of trainable parameters = 50356224


  0%|          | 0/277661 [00:00<?, ?it/s]

{'loss': 0.8757, 'learning_rate': 4.8200978926782185e-05, 'epoch': 0.04}
{'loss': 0.8238, 'learning_rate': 4.640015703166925e-05, 'epoch': 0.07}
{'loss': 0.8112, 'learning_rate': 4.459933513655633e-05, 'epoch': 0.11}
{'loss': 0.8019, 'learning_rate': 4.27985132414434e-05, 'epoch': 0.14}
{'loss': 0.7945, 'learning_rate': 4.099769134633047e-05, 'epoch': 0.18}
{'loss': 0.7902, 'learning_rate': 3.919686945121754e-05, 'epoch': 0.22}
{'loss': 0.7844, 'learning_rate': 3.739604755610461e-05, 'epoch': 0.25}
{'loss': 0.7811, 'learning_rate': 3.559522566099168e-05, 'epoch': 0.29}
{'loss': 0.7778, 'learning_rate': 3.3794403765878745e-05, 'epoch': 0.32}
{'loss': 0.7751, 'learning_rate': 3.199358187076582e-05, 'epoch': 0.36}
{'loss': 0.7725, 'learning_rate': 3.019275997565289e-05, 'epoch': 0.4}
{'loss': 0.7696, 'learning_rate': 2.839193808053996e-05, 'epoch': 0.43}
{'loss': 0.7676, 'learning_rate': 2.659111618542703e-05, 'epoch': 0.47}
{'loss': 0.7668, 'learning_rate': 2.47902942903141e-05, 'epoch':



Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 123748.8893, 'train_samples_per_second': 22.437, 'train_steps_per_second': 2.244, 'train_loss': 0.7741056002052638, 'epoch': 1.0}


TrainOutput(global_step=277661, training_loss=0.7741056002052638, metrics={'train_runtime': 123748.8893, 'train_samples_per_second': 22.437, 'train_steps_per_second': 2.244, 'train_loss': 0.7741056002052638, 'epoch': 1.0})

In [20]:
# Save model and Tokenizer
model_neo.save_pretrained("./Models/GPT-Neo")

Configuration saved in ./Models/GPT-Neo\config.json
Model weights saved in ./Models/GPT-Neo\pytorch_model.bin


In [21]:
tokenizer_neo.save_pretrained("./Models/GPT-Neo")

tokenizer config file saved in ./Models/GPT-Neo\tokenizer_config.json
Special tokens file saved in ./Models/GPT-Neo\special_tokens_map.json


('./Models/GPT-Neo\\tokenizer_config.json',
 './Models/GPT-Neo\\special_tokens_map.json',
 './Models/GPT-Neo\\vocab.json',
 './Models/GPT-Neo\\merges.txt',
 './Models/GPT-Neo\\added_tokens.json',
 './Models/GPT-Neo\\tokenizer.json')

In [3]:
# load in saved model and tokenizer 
tokenizer_neo = AutoTokenizer.from_pretrained("./Models/GPT-Neo")
model_neo = AutoModelForCausalLM.from_pretrained("./Models/GPT-Neo")

### Generate Text

In [45]:
generate_from_model(model_neo, tokenizer_neo, "In view of the fading animals")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: In view of the fading animals
0:  In view of the fading animals.
1:  In view of the fading animals to
2:  In view of the fading animals with his eye!
3:  In view of the fading animals, "with. a new
4:  In view of the fading animals that he he been the nighthe on;
5:  In view of the fading animals.' in the to no by. I,
6:  In view of the fading animals,
7:  In view of the fading animals is our
8:  In view of the fading animals;
9:  In view of the fading animals of the same:


In [6]:
generate_from_model(model_neo, tokenizer_neo, "Shall I compare thee")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: Shall I compare thee
0:  Shall I compare thee he did the earth and still
1:  Shall I compare thee-e the light is they thought youre-like the
2:  Shall I compare thee thou on thee be.
3:  Shall I compare thee the words, and her life
4:  Shall I compare thee, I said; I in life was, of
5:  Shall I compare thee in your soul from each me the sea
6:  Shall I compare thee! and I shall come see had it me:?
7:  Shall I compare thee the years He”--
8:  Shall I compare thee
9:  Shall I compare thee for But who voice it?


In [29]:
generate_from_model(model, tokenizer, "Deep into that darkness")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: Deep into that darkness
0:  Deep into that darkness that never sees--and knows--."
1:  Deep into that darkness."  He paused then:
2:  Deep into that darkness, with the sea waves,
3:  Deep into that darkness, and his blood a mist in her,
4:  Deep into that darkness the darkness. I stand;
5:  Deep into that darkness in that sky?...
6:  Deep into that darkness
7:  Deep into that darkness,--who at this moment
8:  Deep into that darkness; and this life
9:  Deep into that darkness


In [30]:
long_prompt = 'Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence was unbroken, And the stillness gave no token, And the only word there spoken Was the whispered word, "Lenore!" This I'

generate_from_model(model, tokenizer, long_prompt)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence was unbroken, And the stillness gave no token, And the only word there spoken Was the whispered word, "Lenore!" This I
0:  Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence was unbroken, And the stillness gave no token, And the only word there spoken Was the whispered word, "Lenore!" This I
1:  Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence was unbroken, And the stillness gave no token, And the only word there spoken Was the whispered word, "Lenore!" This I.
2:  Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence w

## Generate Whole Poems with DistilGPT2


In [56]:
# Create a new dataframe that combines 25 lines at a time

df_list = df['line'].tolist()

split_list = list()
split_size = 25

for i in range(0, len(df_list), split_size):
    split_list.append(df_list[i:i+split_size])

for i in range(0, len(split_list)):
    split_list[i] = ' '.join(split_list[i])

df_longer_lines = pd.DataFrame(split_list, columns=['line'])

In [57]:
df_longer_lines['line'].iloc[542]

"The Son of God to judge them, terrifi'd Hee fled, not hoping to escape, but shun The present, fearing guiltie what his wrauth Might suddenly inflict; that past, return'd By Night, and listning where the hapless Paire Sate in thir sad discourse, and various plaint, Thence gatherd his own doom, which understood Not instant, but of future time.  With joy And tidings fraught, to Hell he now return'd, And at the brink of CHAOS, neer the foot Of this new wondrous Pontifice, unhop't Met who to meet him came, his Ofspring dear. Great joy was at thir meeting, and at sight Of that stupendious Bridge his joy encreas'd. Long hee admiring stood, till Sin, his faire Inchanting Daughter, thus the silence broke. O Parent, these are thy magnific deeds, Thy Trophies, which thou view'st as not thine own, Thou art thir Author and prime Architect: For I no sooner in my Heart divin'd, My Heart, which by a secret harmonie Still moves with thine, joyn'd in connexion sweet, That thou on Earth hadst prosper'd,

In [8]:
print(df_longer_lines.shape)

(123405, 1)


In [9]:
longer_lines = df_longer_lines['line']
longer_lines

0         The Song of Hiawatha is based on the legends a...
1         Little, flitting, white-fire insect Little, da...
2         Should you ask where Nawadaha Found these song...
3         And beyond them stood the forest, Stood the gr...
4         Through their palisades of pine-trees, And the...
                                ...                        
123400    Pepulitque noctis umbras vegetis sonipedibus. ...
123401    Tempe quae silvae cingunt super inpendentes,--...
123402    Lux mea qua viva vivere dulce mihi'st. In this...
123403    Oh then, full surely thy Quintilia's woe. For ...
123404    And his face the index be, Of his mother's cha...
Name: line, Length: 123405, dtype: object

In [10]:
# load in model
# using distilgpt2 again because it's faster to train
torch.manual_seed(92)

MODEL_NAME = "distilgpt2"

tokenizer_poem = AutoTokenizer.from_pretrained(MODEL_NAME, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
model_poem = AutoModelForCausalLM.from_pretrained(MODEL_NAME).cuda()
model_poem.resize_token_embeddings(len(tokenizer_poem))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 768)

In [11]:
training_args = TrainingArguments(output_dir='./Models/PoemDistil', num_train_epochs=1, logging_steps=10000, save_steps=50000,
                                  per_device_train_batch_size=1, per_device_eval_batch_size=1, warmup_steps=10,
                                   weight_decay=0.05, logging_dir='./Models/PoemDistil/logs', report_to='none' )

In [12]:
max_length = max([len(tokenizer_poem.encode(line)) for line in longer_lines])

In [13]:
max_length

965

In [14]:
tokenization_dataset = PoetryDataset(longer_lines, tokenizer_poem, max_length=max_length)
train_size = int(0.9  * len(tokenization_dataset))
train_dataset, val_dataset = random_split(tokenization_dataset, [train_size, len(tokenization_dataset) - train_size])

In [15]:
train_dataset[0]

(tensor([50257,  2504,   340,   318,   477,   465,  8716,   290, 20788, 11459,
           373,   262,  1110,    11,   355,   314,   423,  1297,   304,   260,
           428,    11,   843,  2312,   385,    11,   351, 28654,  8716,   290,
         30533,    11,  2080,   465, 33812,  3366,  8326,    11,   262, 37063,
         16599,    11,   843, 17608,    11,   331,    12,   565,   313,   704,
           477,   287,  4077,    11,  1550, 10988,   307,   484, 46715, 10611,
           453,    13,   843,   284,   262,  7128,   303,    11,   326,  6204,
           612,  3049,    68,   416,    11,   554,   543,   612,   373,   281,
           289,   433,    11,   355,  1450,   683,  1297,    11, 11083,  2312,
           385,   262,  3892,    68,   835,   288,   849,  1745,    11,   843,
           625,   257,  1379,   482,    11,   290,   523,  6071,   319,   465,
           835,    13,   770, 11083,   481,   423,   257,  1781,   379,   683,
           393,   665,   323,  4698,   262,  4252,  

### Freeze Layers

In [16]:
# freeze the first two layers and 4 hidden units
for name, param in model_poem.named_parameters():
    if name.startswith("transformer.wte"):
        param.requires_grad = False
    if name.startswith("transformer.wpe"):
        param.requires_grad = False
    if any(x in name for x in ['.' + str(x) + '.' for x in range(5)]):
        param.requires_grad = False

In [17]:
# now check layers
for name, param in model_poem.named_parameters():
    print(name, param.requires_grad)

transformer.wte.weight False
transformer.wpe.weight False
transformer.h.0.ln_1.weight False
transformer.h.0.ln_1.bias False
transformer.h.0.attn.c_attn.weight False
transformer.h.0.attn.c_attn.bias False
transformer.h.0.attn.c_proj.weight False
transformer.h.0.attn.c_proj.bias False
transformer.h.0.ln_2.weight False
transformer.h.0.ln_2.bias False
transformer.h.0.mlp.c_fc.weight False
transformer.h.0.mlp.c_fc.bias False
transformer.h.0.mlp.c_proj.weight False
transformer.h.0.mlp.c_proj.bias False
transformer.h.1.ln_1.weight False
transformer.h.1.ln_1.bias False
transformer.h.1.attn.c_attn.weight False
transformer.h.1.attn.c_attn.bias False
transformer.h.1.attn.c_proj.weight False
transformer.h.1.attn.c_proj.bias False
transformer.h.1.ln_2.weight False
transformer.h.1.ln_2.bias False
transformer.h.1.mlp.c_fc.weight False
transformer.h.1.mlp.c_fc.bias False
transformer.h.1.mlp.c_proj.weight False
transformer.h.1.mlp.c_proj.bias False
transformer.h.2.ln_1.weight False
transformer.h.2.ln_1

### Train the Model

In [18]:
Trainer(model=model_poem,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

***** Running training *****
  Num examples = 111064
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 111064
  Number of trainable parameters = 7089408


  0%|          | 0/111064 [00:00<?, ?it/s]

{'loss': 1.3947, 'learning_rate': 4.550218812469609e-05, 'epoch': 0.09}
{'loss': 1.3105, 'learning_rate': 4.09998739352027e-05, 'epoch': 0.18}
{'loss': 1.3076, 'learning_rate': 3.6497559745709296e-05, 'epoch': 0.27}
{'loss': 1.2994, 'learning_rate': 3.19952455562159e-05, 'epoch': 0.36}


Saving model checkpoint to ./Models/PoemDistil\checkpoint-50000
Configuration saved in ./Models/PoemDistil\checkpoint-50000\config.json


{'loss': 1.2903, 'learning_rate': 2.7492931366722495e-05, 'epoch': 0.45}


Model weights saved in ./Models/PoemDistil\checkpoint-50000\pytorch_model.bin


{'loss': 1.2875, 'learning_rate': 2.2990617177229094e-05, 'epoch': 0.54}
{'loss': 1.2864, 'learning_rate': 1.8488302987735697e-05, 'epoch': 0.63}
{'loss': 1.2792, 'learning_rate': 1.3985988798242298e-05, 'epoch': 0.72}
{'loss': 1.2837, 'learning_rate': 9.483674608748897e-06, 'epoch': 0.81}


Saving model checkpoint to ./Models/PoemDistil\checkpoint-100000
Configuration saved in ./Models/PoemDistil\checkpoint-100000\config.json


{'loss': 1.2817, 'learning_rate': 4.981360419255498e-06, 'epoch': 0.9}


Model weights saved in ./Models/PoemDistil\checkpoint-100000\pytorch_model.bin


{'loss': 1.2761, 'learning_rate': 4.790462297620977e-07, 'epoch': 0.99}




Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 6025.1475, 'train_samples_per_second': 18.433, 'train_steps_per_second': 18.433, 'train_loss': 1.2994928379906519, 'epoch': 1.0}


TrainOutput(global_step=111064, training_loss=1.2994928379906519, metrics={'train_runtime': 6025.1475, 'train_samples_per_second': 18.433, 'train_steps_per_second': 18.433, 'train_loss': 1.2994928379906519, 'epoch': 1.0})

In [19]:
# Save model and Tokenizer
model_poem.save_pretrained("./Models/PoemDistil")

Configuration saved in ./Models/PoemDistil\config.json
Model weights saved in ./Models/PoemDistil\pytorch_model.bin


In [20]:
tokenizer_poem.save_pretrained("./Models/PoemDistil")

tokenizer config file saved in ./Models/PoemDistil\tokenizer_config.json
Special tokens file saved in ./Models/PoemDistil\special_tokens_map.json


('./Models/PoemDistil\\tokenizer_config.json',
 './Models/PoemDistil\\special_tokens_map.json',
 './Models/PoemDistil\\vocab.json',
 './Models/PoemDistil\\merges.txt',
 './Models/PoemDistil\\added_tokens.json',
 './Models/PoemDistil\\tokenizer.json')

In [7]:
# load in saved model and tokenizer 
tokenizer_poem = AutoTokenizer.from_pretrained("./Models/PoemDistil")
model_poem = AutoModelForCausalLM.from_pretrained("./Models/PoemDistil")

### Generate Outputs


In [12]:
generate_from_model(model_poem, tokenizer_poem,"In view of the fading animals", temp=1.5, num_outputs=2)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: In view of the fading animals
0:  In view of the fading animals' memory, Weigh in, that, with that they'd passed The bright moon that never came, Nor from heaven, nor their moon in one land-- Nor from his house in the field of dreams to the night's light, Which was made him more, in heaven or in Paradise-- As by the ways, though he could be no mortal, Which came not like thee? It seemed he only had a mind Which should have been no mortal for;--for he lived,--who now, and still He made, to sleep, who sleep. There the thought took off-- And soon he came again as the sea, Where are he? His footsteps have pierced him, I go, when to sleep The waves have run along the sea's banks; He went with them, he left them here, To wake me and me--for what I said. He went a journey by walking in all the air, In that sea I wandered there; In that sea I never went, Where should my soul's will be, If which a word to make, or sound That say would come, in the ocean that I never heard. There we

In [60]:
generate_from_model(model_poem, tokenizer_poem,"Shall I compare thee", temp=1.5, num_outputs=2, max_length=150)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: Shall I compare thee
0:  Shall I compare thee, or say with thy soul as to what man. So will no man. In the world it was not in this same age, Then did my Father be of some great thing, Yet did man have no life beyond the reach Of his very bosoms. I must have thought about how we saw the other; For one of our fathers said to his mother's door In an unknown hour as to whom she fell:  but thou dost say Thy words which my mother cried on." And so long he dost say,--I cannot but say of thee "Take me into this house and thou wouldst not: Thou shalt no man stand alone And my only way I might live." He said unto my father,
1:  Shall I compare thee with him to whom thyself will suffer, O I see His mercy, The holy God, the divine, the living; My love shall be as great as ever, O I shall bear his Son in mine! He will be with Him there in that world, for my God Has sent her to bless me. 'Tis yet more a little to love: O thou may think it are my heart to love thy heart. Thou art not su

In [8]:
generate_from_model(model_poem, tokenizer_poem,"Deep into that darkness", temp=1.5, num_outputs=2, max_length=300)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: Deep into that darkness
0:  Deep into that darkness there he sat upon the tree That lies where it fell Upon a mountain which spilt. Now, after night a great tree was cast Down into hell, Through its darkened hollows and gray vines That rose, for its sake the fruit and fruit it fell Where it fell Its fruit hung on its branches above. And on that dark, I heard a long, long way passing, That I hear it through its open wide open open sky, On some hill of grass and with the snow It rose that broke its leaves where it fell. Then, on certain part it floated Through the narrow and flat blackness In the snow and snow, I could not breathe. It flew over and I saw its beauty with eyes; On that dark dark, in its coldness Its beauty. And now here it fell A great leaf as a shadow that spread Over the sky! Where is the leaf that lay Here? A hollow leaf of snow from below it! It was born with it a heart, a blood and blood-pleas'd tree, To a dead earth its own kind and death.  Where it came

In [10]:
long_prompt = 'Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence was unbroken, And the stillness gave no token, And the only word there spoken Was the whispered word, "Lenore!" This I'

generate_from_model(model_poem, tokenizer_poem, long_prompt, num_outputs=2)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence was unbroken, And the stillness gave no token, And the only word there spoken Was the whispered word, "Lenore!" This I
0:  Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence was unbroken, And the stillness gave no token, And the only word there spoken Was the whispered word, "Lenore!" This I thought, and I answered, "What should be!" and I asked, "A mother so old and so ill! Do you know your face and this? It is not my mother!" I replied again, "I cannot know for you I only know! I cannot tell if it is her heart, because he's not at your hands; And if you can do your best, He's not at the heart." My father's face was wide awake, so all my heart was dark That had grown green And the sea trembled as the wind blew My mind a

In [14]:
generate_from_model(model_poem, tokenizer_poem,"Finally finished", temp=1.5, num_outputs=5, max_length=300)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: Finally finished
0:  Finally finished with me to see them grow: And they all did so with delight and their good faces. It's all we can say: And then he gave you their gifts in my bosom; I wish them no care. But a dream was born in his bosom and full-blown, And a child, my heart beat fast to the ground, And in the bosom gave you life for him, with full grace. O baby of your love is yours all too strong as your mother; As you had your parents from me. But I had you on board a young life-long journey That led in you and my sister--no one came so near the world To greet the people, to make your mother happy again. It was me; but a child I had seen that had grown and grow-- And a daughter born into your bosom was a friend of hers. She was a man born from the bosom, an equal time with the mother's own. O son from the bosom! A beautiful baby from a sweet child is such a dear man-- For every life you live, he gave for me, for it brought and made, O baby of your love and the happy 