# Project Part 3

In [11]:
# imports

import pandas as pd
import torch 
from torch.utils.data import Dataset, random_split
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset

In [12]:
# load dataset
dataset = load_dataset("biglam/gutenberg-poetry-corpus", split="train")

Using custom data configuration cakiki--gutenberg-poetry-corpus-7745b6aecdad34dc
Found cached dataset parquet (C:/Users/Shayne Kaiser/.cache/huggingface/datasets/biglam___parquet/cakiki--gutenberg-poetry-corpus-7745b6aecdad34dc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [13]:
dataset

Dataset({
    features: ['line', 'gutenberg_id'],
    num_rows: 3085117
})

In [14]:
# convert to Pandas
df = pd.DataFrame(dataset)

In [15]:
print('df shape:', df.shape)

df shape: (3085117, 2)


In [16]:
lines = df["line"]
lines.head()

0    The Song of Hiawatha is based on the legends a...
1    many North American Indian tribes, but especia...
2    Ojibway Indians of northern Michigan, Wisconsi...
3    They were collected by Henry Rowe Schoolcraft,...
4    Schoolcraft married Jane, O-bah-bahm-wawa-ge-z...
Name: line, dtype: object

In [17]:
class PoetryDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [18]:
# function for generating text 
def generate_from_model(model, tokenizer, prompt, max_length=300, temp=1.5, num_outputs=10):
    """
    Tokenize the given prompt, must be one string, and generate output from a  provided model and tokenizer.

    Args:
        mondel (transformers.model): The model being used to generate text
        tokenizer: the tokenizer being used
        prompt (str): The input string that is used to generate text
        max_length (int): Max character length of the generated outputs
        temp (int): Set the temperature for the outputs
        num_outputs (int): number of different outputs to be created
        
    """
    print("Outputs for: " + prompt)

    generated = tokenizer("<|startoftext|> " + prompt, return_tensors="pt").input_ids
    
    sample_outputs = model.generate(generated, max_length=max_length, do_sample=True, top_p=0.95, top_k=50, temperature=temp, num_return_sequences=num_outputs)

    for i, sample_output in enumerate(sample_outputs):
        print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

## DistilGPT2

In [19]:
# load in model
torch.manual_seed(92)

MODEL_NAME = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).cuda()
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 768)

In [20]:
training_args = TrainingArguments(output_dir='./Models/DistilGPT2', num_train_epochs=1, logging_steps=10000, save_steps=50000,
                                  per_device_train_batch_size=10, per_device_eval_batch_size=10, warmup_steps=10,
                                   weight_decay=0.05, logging_dir='./Models/DistilGPT2/logs', report_to='none' )

In [21]:
max_length = max([len(tokenizer.encode(line)) for line in lines])

In [22]:
max_length

85

In [23]:
tokenization_dataset = PoetryDataset(lines, tokenizer, max_length=max_length)
train_size = int(0.9  * len(tokenization_dataset))
train_dataset, val_dataset = random_split(tokenization_dataset, [train_size, len(tokenization_dataset) - train_size])

In [24]:
train_dataset[0]

(tensor([50257,   818,   262,  7032,   262,   302,   521, 28153,  4836,   606,
         11496, 50256, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [25]:
print(f"Length of Training dataset: {len(train_dataset)}")
print(f"Length of Validation dataset: {len(val_dataset)}")

Length of Training dataset: 2776605
Length of Validation dataset: 308512


### Freezing layers

In [26]:
# check layers in the model
for name, param in model.named_parameters():
    print(name, param.requires_grad)

transformer.wte.weight True
transformer.wpe.weight True
transformer.h.0.ln_1.weight True
transformer.h.0.ln_1.bias True
transformer.h.0.attn.c_attn.weight True
transformer.h.0.attn.c_attn.bias True
transformer.h.0.attn.c_proj.weight True
transformer.h.0.attn.c_proj.bias True
transformer.h.0.ln_2.weight True
transformer.h.0.ln_2.bias True
transformer.h.0.mlp.c_fc.weight True
transformer.h.0.mlp.c_fc.bias True
transformer.h.0.mlp.c_proj.weight True
transformer.h.0.mlp.c_proj.bias True
transformer.h.1.ln_1.weight True
transformer.h.1.ln_1.bias True
transformer.h.1.attn.c_attn.weight True
transformer.h.1.attn.c_attn.bias True
transformer.h.1.attn.c_proj.weight True
transformer.h.1.attn.c_proj.bias True
transformer.h.1.ln_2.weight True
transformer.h.1.ln_2.bias True
transformer.h.1.mlp.c_fc.weight True
transformer.h.1.mlp.c_fc.bias True
transformer.h.1.mlp.c_proj.weight True
transformer.h.1.mlp.c_proj.bias True
transformer.h.2.ln_1.weight True
transformer.h.2.ln_1.bias True
transformer.h.2.

In [27]:
# freeze the first two layers and 4 hidden units
for name, param in model.named_parameters():
    if name.startswith("transformer.wte"):
        param.requires_grad = False
    if name.startswith("transformer.wpe"):
        param.requires_grad = False
    if any(x in name for x in ['.' + str(x) + '.' for x in range(5)]):
        param.requires_grad = False

In [28]:
# now check layers
for name, param in model.named_parameters():
    print(name, param.requires_grad)

transformer.wte.weight False
transformer.wpe.weight False
transformer.h.0.ln_1.weight False
transformer.h.0.ln_1.bias False
transformer.h.0.attn.c_attn.weight False
transformer.h.0.attn.c_attn.bias False
transformer.h.0.attn.c_proj.weight False
transformer.h.0.attn.c_proj.bias False
transformer.h.0.ln_2.weight False
transformer.h.0.ln_2.bias False
transformer.h.0.mlp.c_fc.weight False
transformer.h.0.mlp.c_fc.bias False
transformer.h.0.mlp.c_proj.weight False
transformer.h.0.mlp.c_proj.bias False
transformer.h.1.ln_1.weight False
transformer.h.1.ln_1.bias False
transformer.h.1.attn.c_attn.weight False
transformer.h.1.attn.c_attn.bias False
transformer.h.1.attn.c_proj.weight False
transformer.h.1.attn.c_proj.bias False
transformer.h.1.ln_2.weight False
transformer.h.1.ln_2.bias False
transformer.h.1.mlp.c_fc.weight False
transformer.h.1.mlp.c_fc.bias False
transformer.h.1.mlp.c_proj.weight False
transformer.h.1.mlp.c_proj.bias False
transformer.h.2.ln_1.weight False
transformer.h.2.ln_1

### Train the Model

In [29]:

Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()



***** Running training *****
  Num examples = 2776605
  Num Epochs = 1
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 277661
  Number of trainable parameters = 7089408


  0%|          | 0/277661 [00:00<?, ?it/s]

{'loss': 0.7205, 'learning_rate': 4.8200978926782185e-05, 'epoch': 0.04}
{'loss': 0.6612, 'learning_rate': 4.640015703166925e-05, 'epoch': 0.07}
{'loss': 0.6561, 'learning_rate': 4.459933513655633e-05, 'epoch': 0.11}
{'loss': 0.651, 'learning_rate': 4.27985132414434e-05, 'epoch': 0.14}


Saving model checkpoint to ./Models/DistilGPT2\checkpoint-50000
Configuration saved in ./Models/DistilGPT2\checkpoint-50000\config.json


{'loss': 0.6485, 'learning_rate': 4.099769134633047e-05, 'epoch': 0.18}


Model weights saved in ./Models/DistilGPT2\checkpoint-50000\pytorch_model.bin


{'loss': 0.6451, 'learning_rate': 3.919686945121754e-05, 'epoch': 0.22}
{'loss': 0.6437, 'learning_rate': 3.739604755610461e-05, 'epoch': 0.25}
{'loss': 0.6417, 'learning_rate': 3.559522566099168e-05, 'epoch': 0.29}
{'loss': 0.6399, 'learning_rate': 3.3794403765878745e-05, 'epoch': 0.32}


Saving model checkpoint to ./Models/DistilGPT2\checkpoint-100000


{'loss': 0.6389, 'learning_rate': 3.199358187076582e-05, 'epoch': 0.36}


Configuration saved in ./Models/DistilGPT2\checkpoint-100000\config.json
Model weights saved in ./Models/DistilGPT2\checkpoint-100000\pytorch_model.bin


{'loss': 0.6379, 'learning_rate': 3.019275997565289e-05, 'epoch': 0.4}
{'loss': 0.6369, 'learning_rate': 2.839193808053996e-05, 'epoch': 0.43}
{'loss': 0.6371, 'learning_rate': 2.659111618542703e-05, 'epoch': 0.47}
{'loss': 0.6361, 'learning_rate': 2.47902942903141e-05, 'epoch': 0.5}


Saving model checkpoint to ./Models/DistilGPT2\checkpoint-150000
Configuration saved in ./Models/DistilGPT2\checkpoint-150000\config.json


{'loss': 0.6335, 'learning_rate': 2.298947239520117e-05, 'epoch': 0.54}


Model weights saved in ./Models/DistilGPT2\checkpoint-150000\pytorch_model.bin


{'loss': 0.6334, 'learning_rate': 2.118865050008824e-05, 'epoch': 0.58}
{'loss': 0.6331, 'learning_rate': 1.938782860497531e-05, 'epoch': 0.61}
{'loss': 0.6335, 'learning_rate': 1.7587006709862385e-05, 'epoch': 0.65}
{'loss': 0.6317, 'learning_rate': 1.5786184814749453e-05, 'epoch': 0.68}


Saving model checkpoint to ./Models/DistilGPT2\checkpoint-200000


{'loss': 0.6318, 'learning_rate': 1.3985362919636521e-05, 'epoch': 0.72}


Configuration saved in ./Models/DistilGPT2\checkpoint-200000\config.json
Model weights saved in ./Models/DistilGPT2\checkpoint-200000\pytorch_model.bin


{'loss': 0.632, 'learning_rate': 1.2184541024523593e-05, 'epoch': 0.76}
{'loss': 0.6321, 'learning_rate': 1.0383719129410665e-05, 'epoch': 0.79}
{'loss': 0.6297, 'learning_rate': 8.582897234297733e-06, 'epoch': 0.83}
{'loss': 0.6309, 'learning_rate': 6.782075339184804e-06, 'epoch': 0.86}


Saving model checkpoint to ./Models/DistilGPT2\checkpoint-250000
Configuration saved in ./Models/DistilGPT2\checkpoint-250000\config.json


{'loss': 0.6308, 'learning_rate': 4.981253444071874e-06, 'epoch': 0.9}


Model weights saved in ./Models/DistilGPT2\checkpoint-250000\pytorch_model.bin


{'loss': 0.6302, 'learning_rate': 3.1804315489589452e-06, 'epoch': 0.94}
{'loss': 0.63, 'learning_rate': 1.3796096538460155e-06, 'epoch': 0.97}




Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 11295.2762, 'train_samples_per_second': 245.82, 'train_steps_per_second': 24.582, 'train_loss': 0.6407216623165254, 'epoch': 1.0}


TrainOutput(global_step=277661, training_loss=0.6407216623165254, metrics={'train_runtime': 11295.2762, 'train_samples_per_second': 245.82, 'train_steps_per_second': 24.582, 'train_loss': 0.6407216623165254, 'epoch': 1.0})

In [30]:
# Save model and Tokenizer
model.save_pretrained("./Models/DistilGPT2")

Configuration saved in ./Models/DistilGPT2\config.json
Model weights saved in ./Models/DistilGPT2\pytorch_model.bin


In [31]:
tokenizer.save_pretrained("./Models/DistilGPT2")

tokenizer config file saved in ./Models/DistilGPT2\tokenizer_config.json
Special tokens file saved in ./Models/DistilGPT2\special_tokens_map.json


('./Models/DistilGPT2\\tokenizer_config.json',
 './Models/DistilGPT2\\special_tokens_map.json',
 './Models/DistilGPT2\\vocab.json',
 './Models/DistilGPT2\\merges.txt',
 './Models/DistilGPT2\\added_tokens.json',
 './Models/DistilGPT2\\tokenizer.json')

In [32]:
# load in saved model and tokenizer 
tokenizer = AutoTokenizer.from_pretrained("./Models/DistilGPT2")
model = AutoModelForCausalLM.from_pretrained("./Models/DistilGPT2")

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./Models/DistilGPT2\config.json
Model config GPT2Config {
  "_name_or_path": "./Models/DistilGPT2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,

### Generate Outputs

In [33]:
generate_from_model(model, tokenizer, "In view of the fading animals")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: In view of the fading animals
0:  In view of the fading animals
1:  In view of the fading animals which
2:  In view of the fading animals, I will only live for the
3:  In view of the fading animals at its tail,
4:  In view of the fading animals
5:  In view of the fading animals the little flock of young,
6:  In view of the fading animals.
7:  In view of the fading animals
8:  In view of the fading animals who wander and flee.
9:  In view of the fading animals and the lost;


In [34]:
generate_from_model(model, tokenizer, "Shall I compare thee")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: Shall I compare thee
0:  Shall I compare thee in some respect not so
1:  Shall I compare thee, with his art so dark
2:  Shall I compare thee and the other?
3:  Shall I compare thee, and tell his son to look in thee!
4:  Shall I compare thee?
5:  Shall I compare thee to me;
6:  Shall I compare thee with thee
7:  Shall I compare thee; and then shall thou see
8:  Shall I compare thee more thy grace and my good,
9:  Shall I compare thee for my son,


In [35]:
generate_from_model(model, tokenizer, "Deep into that darkness")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: Deep into that darkness
0:  Deep into that darkness beneath his heart!
1:  Deep into that darkness in that bright hour, we knew
2:  Deep into that darkness lay: the night of darkness on us to be
3:  Deep into that darkness; a dream of life
4:  Deep into that darkness,
5:  Deep into that darkness he saw no face. No voice
6:  Deep into that darkness, where the long night
7:  Deep into that darkness his soul in the night falls;
8:  Deep into that darkness lay its dark abyss;
9:  Deep into that darkness they may not lie so long,


In [36]:
long_prompt = 'Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence was unbroken, And the stillness gave no token, And the only word there spoken Was the whispered word, "Lenore!" This I'

generate_from_model(model, tokenizer, long_prompt)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence was unbroken, And the stillness gave no token, And the only word there spoken Was the whispered word, "Lenore!" This I
0:  Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence was unbroken, And the stillness gave no token, And the only word there spoken Was the whispered word, "Lenore!" This I,
1:  Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence was unbroken, And the stillness gave no token, And the only word there spoken Was the whispered word, "Lenore!" This I
2:  Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence w

In [33]:
generate_from_model(model, tokenizer, "I stay")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: I stay
0:  I stay too late to get away
1:  I stay, in my arms the wind,
2:  I stay to do his duty.
3:  I stay awake on nights long,
4:  I stay up for a day alone to have
5:  I stay--no longer, no more?--you don't have much to think:
6:  I stay so old and strong as a child,
7:  I stay true to the man I loved--
8:  I stay for nights, I pray till I see my Father--
9:  I stay but never leave it, it shall always be,


## GPT-Neo

In [9]:
# load in model
torch.manual_seed(92)

MODEL_NAME = "EleutherAI/gpt-neo-1.3B"

tokenizer_neo = AutoTokenizer.from_pretrained(MODEL_NAME, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
model_neo = AutoModelForCausalLM.from_pretrained(MODEL_NAME).cuda()
model_neo.resize_token_embeddings(len(tokenizer_neo))


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 2048)

In [10]:
# Traing arguments
training_args_neo = TrainingArguments(output_dir='./Models/GPT-Neo', num_train_epochs=1, logging_steps=10000, save_steps=500000,
                                  per_device_train_batch_size=10, per_device_eval_batch_size=10, warmup_steps=10,
                                   weight_decay=0.05, logging_dir='./Models/GPT-Neo/logs', report_to='none' )

### Tokenize data

In [11]:
max_length = max([len(tokenizer_neo.encode(line)) for line in lines])

In [12]:
max_length

85

In [13]:
tokenization_dataset = PoetryDataset(lines, tokenizer_neo, max_length=max_length)
train_size = int(0.9  * len(tokenization_dataset))
train_dataset_neo, val_dataset_neo = random_split(tokenization_dataset, [train_size, len(tokenization_dataset) - train_size])

In [14]:
train_dataset_neo[0]

(tensor([50257,  3152,   257,  9480,  2786,   273,    11,   543,    11,   996,
           284,   262,  4151, 50256, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [15]:
print(f"Length of Training dataset: {len(train_dataset_neo)}")
print(f"Length of Validation dataset: {len(val_dataset_neo)}")

Length of Training dataset: 2776605
Length of Validation dataset: 308512


### Freeze layers

In [16]:
for name, param in model_neo.named_parameters():
    print(name, param.requires_grad)

transformer.wte.weight True
transformer.wpe.weight True
transformer.h.0.ln_1.weight True
transformer.h.0.ln_1.bias True
transformer.h.0.attn.attention.k_proj.weight True
transformer.h.0.attn.attention.v_proj.weight True
transformer.h.0.attn.attention.q_proj.weight True
transformer.h.0.attn.attention.out_proj.weight True
transformer.h.0.attn.attention.out_proj.bias True
transformer.h.0.ln_2.weight True
transformer.h.0.ln_2.bias True
transformer.h.0.mlp.c_fc.weight True
transformer.h.0.mlp.c_fc.bias True
transformer.h.0.mlp.c_proj.weight True
transformer.h.0.mlp.c_proj.bias True
transformer.h.1.ln_1.weight True
transformer.h.1.ln_1.bias True
transformer.h.1.attn.attention.k_proj.weight True
transformer.h.1.attn.attention.v_proj.weight True
transformer.h.1.attn.attention.q_proj.weight True
transformer.h.1.attn.attention.out_proj.weight True
transformer.h.1.attn.attention.out_proj.bias True
transformer.h.1.ln_2.weight True
transformer.h.1.ln_2.bias True
transformer.h.1.mlp.c_fc.weight True

In [17]:
# freeze the first two layers and 23 hidden units
for name, param in model_neo.named_parameters():
    if name.startswith("transformer.wte"):
        param.requires_grad = False
    if name.startswith("transformer.wpe"):
        param.requires_grad = False
    if any(x in name for x in ['.' + str(x) + '.' for x in range(23)]):
        param.requires_grad = False

In [18]:
# check layers
for name, param in model_neo.named_parameters():
    print(name, param.requires_grad)

transformer.wte.weight False
transformer.wpe.weight False
transformer.h.0.ln_1.weight False
transformer.h.0.ln_1.bias False
transformer.h.0.attn.attention.k_proj.weight False
transformer.h.0.attn.attention.v_proj.weight False
transformer.h.0.attn.attention.q_proj.weight False
transformer.h.0.attn.attention.out_proj.weight False
transformer.h.0.attn.attention.out_proj.bias False
transformer.h.0.ln_2.weight False
transformer.h.0.ln_2.bias False
transformer.h.0.mlp.c_fc.weight False
transformer.h.0.mlp.c_fc.bias False
transformer.h.0.mlp.c_proj.weight False
transformer.h.0.mlp.c_proj.bias False
transformer.h.1.ln_1.weight False
transformer.h.1.ln_1.bias False
transformer.h.1.attn.attention.k_proj.weight False
transformer.h.1.attn.attention.v_proj.weight False
transformer.h.1.attn.attention.q_proj.weight False
transformer.h.1.attn.attention.out_proj.weight False
transformer.h.1.attn.attention.out_proj.bias False
transformer.h.1.ln_2.weight False
transformer.h.1.ln_2.bias False
transformer.

### Train the model

In [19]:
Trainer(model=model_neo,  args=training_args_neo, train_dataset=train_dataset_neo, 
        eval_dataset=val_dataset_neo, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

***** Running training *****
  Num examples = 2776605
  Num Epochs = 1
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 277661
  Number of trainable parameters = 50356224


  0%|          | 0/277661 [00:00<?, ?it/s]

{'loss': 0.8757, 'learning_rate': 4.8200978926782185e-05, 'epoch': 0.04}
{'loss': 0.8238, 'learning_rate': 4.640015703166925e-05, 'epoch': 0.07}
{'loss': 0.8112, 'learning_rate': 4.459933513655633e-05, 'epoch': 0.11}
{'loss': 0.8019, 'learning_rate': 4.27985132414434e-05, 'epoch': 0.14}
{'loss': 0.7945, 'learning_rate': 4.099769134633047e-05, 'epoch': 0.18}
{'loss': 0.7902, 'learning_rate': 3.919686945121754e-05, 'epoch': 0.22}
{'loss': 0.7844, 'learning_rate': 3.739604755610461e-05, 'epoch': 0.25}
{'loss': 0.7811, 'learning_rate': 3.559522566099168e-05, 'epoch': 0.29}
{'loss': 0.7778, 'learning_rate': 3.3794403765878745e-05, 'epoch': 0.32}
{'loss': 0.7751, 'learning_rate': 3.199358187076582e-05, 'epoch': 0.36}
{'loss': 0.7725, 'learning_rate': 3.019275997565289e-05, 'epoch': 0.4}
{'loss': 0.7696, 'learning_rate': 2.839193808053996e-05, 'epoch': 0.43}
{'loss': 0.7676, 'learning_rate': 2.659111618542703e-05, 'epoch': 0.47}
{'loss': 0.7668, 'learning_rate': 2.47902942903141e-05, 'epoch':



Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 123748.8893, 'train_samples_per_second': 22.437, 'train_steps_per_second': 2.244, 'train_loss': 0.7741056002052638, 'epoch': 1.0}


TrainOutput(global_step=277661, training_loss=0.7741056002052638, metrics={'train_runtime': 123748.8893, 'train_samples_per_second': 22.437, 'train_steps_per_second': 2.244, 'train_loss': 0.7741056002052638, 'epoch': 1.0})

In [20]:
# Save model and Tokenizer
model_neo.save_pretrained("./Models/GPT-Neo")

Configuration saved in ./Models/GPT-Neo\config.json
Model weights saved in ./Models/GPT-Neo\pytorch_model.bin


In [21]:
tokenizer_neo.save_pretrained("./Models/GPT-Neo")

tokenizer config file saved in ./Models/GPT-Neo\tokenizer_config.json
Special tokens file saved in ./Models/GPT-Neo\special_tokens_map.json


('./Models/GPT-Neo\\tokenizer_config.json',
 './Models/GPT-Neo\\special_tokens_map.json',
 './Models/GPT-Neo\\vocab.json',
 './Models/GPT-Neo\\merges.txt',
 './Models/GPT-Neo\\added_tokens.json',
 './Models/GPT-Neo\\tokenizer.json')

In [26]:
# load in saved model and tokenizer 
tokenizer_neo = AutoTokenizer.from_pretrained("./Models/GPT-Neo")
model_neo = AutoModelForCausalLM.from_pretrained("./Models/GPT-Neo")

### Generate Text

In [27]:
generate_from_model(model_neo, tokenizer_neo, "In view of the fading animals")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: In view of the fading animals
0:  In view of the fading animals from every He hestarkainers on�ed as I
1:  In view of the fading animals of
2:  In view of the fading animals,
3:  In view of the fading animals of
4:  In view of the fading animals his mind, by the
5:  In view of the fading animals,
6:  In view of the fading animals;
7:  In view of the fading animals on us with no eye,
8:  In view of the fading animals,
9:  In view of the fading animals's Now


In [28]:
generate_from_model(model, tokenizer, "Shall I compare thee")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: Shall I compare thee
0:  Shall I compare thee, my Godhead with thee;
1:  Shall I compare thee with his life
2:  Shall I compare thee now.”
3:  Shall I compare thee with him, which he
4:  Shall I compare thee?--the son of Men.
5:  Shall I compare thee? Shall it give thy face
6:  Shall I compare thee: a son born of a lord
7:  Shall I compare thee with him alone, then in peace thou hast hast?
8:  Shall I compare thee to the king's wrath?
9:  Shall I compare thee unto us:--_It be;


In [29]:
generate_from_model(model, tokenizer, "Deep into that darkness")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: Deep into that darkness
0:  Deep into that darkness that never sees--and knows--."
1:  Deep into that darkness."  He paused then:
2:  Deep into that darkness, with the sea waves,
3:  Deep into that darkness, and his blood a mist in her,
4:  Deep into that darkness the darkness. I stand;
5:  Deep into that darkness in that sky?...
6:  Deep into that darkness
7:  Deep into that darkness,--who at this moment
8:  Deep into that darkness; and this life
9:  Deep into that darkness


In [30]:
long_prompt = 'Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence was unbroken, And the stillness gave no token, And the only word there spoken Was the whispered word, "Lenore!" This I'

generate_from_model(model, tokenizer, long_prompt)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence was unbroken, And the stillness gave no token, And the only word there spoken Was the whispered word, "Lenore!" This I
0:  Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence was unbroken, And the stillness gave no token, And the only word there spoken Was the whispered word, "Lenore!" This I
1:  Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence was unbroken, And the stillness gave no token, And the only word there spoken Was the whispered word, "Lenore!" This I.
2:  Deep into that darkness peering, Long I stood there, wondering, fearing, Doubting, dreaming dreams no mortals Ever dared to dream before; But the silence w

In [44]:
generate_from_model(model_neo, tokenizer_neo, "Finished")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: Finished
0:  Finished to-day, but thy word the the
1:  Finisheds, whose fings'er not was the seare of gold,
2:  Finished of each country is
3:  Finished; was of thy's y at that I say--
4:  Finished, but and his own with a dream of her, " was said, I went." it is done.--_
5:  Finished thou for dainei, is no
6:  Finished, and so had no gild-man to me.
7:  Finished I'd on the world which in a bl the st
8:  Finished that is a while that wrees
9:  Finished in in his eyes from the morning to know of  He I have I say--


In [37]:
generate_from_model(model, tokenizer, "Finished")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Outputs for: Finished
0:  Finished from me the way the others are;--
1:  Finished them round and stood with ease to walk
2:  Finished thy song--this my Lord
3:  Finished, and his hair is all pink as with hair;
4:  Finished, and for a while his tongue so loud had
5:  Finished one little day and the night became;
6:  Finished and bound at the head
7:  Finished up, like the sea aflame. But never left a trace
8:  Finished: about four pounds,
9:  Finished the journey, or in vain, with his eye
