# Project Part 3

In [2]:
# imports

import pandas as pd
import torch 
from torch.utils.data import Dataset, random_split
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset

In [3]:
# load dataset
dataset = load_dataset("biglam/gutenberg-poetry-corpus", split="train")

Using custom data configuration cakiki--gutenberg-poetry-corpus-7745b6aecdad34dc
Found cached dataset parquet (C:/Users/Shayne Kaiser/.cache/huggingface/datasets/biglam___parquet/cakiki--gutenberg-poetry-corpus-7745b6aecdad34dc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [4]:
dataset

Dataset({
    features: ['line', 'gutenberg_id'],
    num_rows: 3085117
})

In [5]:
# convert to Pandas
df = pd.DataFrame(dataset)

In [6]:
print('df shape:', df.shape)

df shape: (3085117, 2)


In [7]:
lines = df["line"]
lines.head()

0    The Song of Hiawatha is based on the legends a...
1    many North American Indian tribes, but especia...
2    Ojibway Indians of northern Michigan, Wisconsi...
3    They were collected by Henry Rowe Schoolcraft,...
4    Schoolcraft married Jane, O-bah-bahm-wawa-ge-z...
Name: line, dtype: object

In [17]:
max_length = max([len(tokenizer.encode(line)) for line in lines])

In [18]:
max_length

85

In [8]:
class PoetryDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [20]:
tokenization_dataset = PoetryDataset(lines, tokenizer, max_length=max_length)
train_size = int(0.9  * len(tokenization_dataset))
train_dataset, val_dataset = random_split(tokenization_dataset, [train_size, len(tokenization_dataset) - train_size])

In [21]:
train_dataset[0]

(tensor([50257,   818,   262,  7032,   262,   302,   521, 28153,  4836,   606,
         11496, 50256, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [22]:
print(f"Length of Training dataset: {len(train_dataset)}")
print(f"Length of Validation dataset: {len(val_dataset)}")

Length of Training dataset: 2776605
Length of Validation dataset: 308512


## DistilGPT2

In [4]:
# load in model
torch.manual_seed(92)

MODEL_NAME = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).cuda()
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 768)

In [None]:
training_args = TrainingArguments(output_dir='./Models/DistilGPT2', num_train_epochs=1, logging_steps=10000, save_steps=500000,
                                  per_device_train_batch_size=10, per_device_eval_batch_size=10, warmup_steps=10,
                                   weight_decay=0.05, logging_dir='./Models/DistilGPT2/logs', report_to='none' )

### Freezing layers

In [24]:
# check layers in the model
for name, param in model.named_parameters():
    print(name, param.requires_grad)

transformer.wte.weight True
transformer.wpe.weight True
transformer.h.0.ln_1.weight True
transformer.h.0.ln_1.bias True
transformer.h.0.attn.c_attn.weight True
transformer.h.0.attn.c_attn.bias True
transformer.h.0.attn.c_proj.weight True
transformer.h.0.attn.c_proj.bias True
transformer.h.0.ln_2.weight True
transformer.h.0.ln_2.bias True
transformer.h.0.mlp.c_fc.weight True
transformer.h.0.mlp.c_fc.bias True
transformer.h.0.mlp.c_proj.weight True
transformer.h.0.mlp.c_proj.bias True
transformer.h.1.ln_1.weight True
transformer.h.1.ln_1.bias True
transformer.h.1.attn.c_attn.weight True
transformer.h.1.attn.c_attn.bias True
transformer.h.1.attn.c_proj.weight True
transformer.h.1.attn.c_proj.bias True
transformer.h.1.ln_2.weight True
transformer.h.1.ln_2.bias True
transformer.h.1.mlp.c_fc.weight True
transformer.h.1.mlp.c_fc.bias True
transformer.h.1.mlp.c_proj.weight True
transformer.h.1.mlp.c_proj.bias True
transformer.h.2.ln_1.weight True
transformer.h.2.ln_1.bias True
transformer.h.2.

In [27]:
# freeze the first two layers and 4 hidden units
for name, param in model.named_parameters():
    if name.startswith("transformer.wte"):
        param.requires_grad = False
    if name.startswith("transformer.wpe"):
        param.requires_grad = False
    if any(x in name for x in ['.' + str(x) + '.' for x in range(5)]):
        param.requires_grad = False

In [28]:
# now check layers
for name, param in model.named_parameters():
    print(name, param.requires_grad)

transformer.wte.weight False
transformer.wpe.weight False
transformer.h.0.ln_1.weight False
transformer.h.0.ln_1.bias False
transformer.h.0.attn.c_attn.weight False
transformer.h.0.attn.c_attn.bias False
transformer.h.0.attn.c_proj.weight False
transformer.h.0.attn.c_proj.bias False
transformer.h.0.ln_2.weight False
transformer.h.0.ln_2.bias False
transformer.h.0.mlp.c_fc.weight False
transformer.h.0.mlp.c_fc.bias False
transformer.h.0.mlp.c_proj.weight False
transformer.h.0.mlp.c_proj.bias False
transformer.h.1.ln_1.weight False
transformer.h.1.ln_1.bias False
transformer.h.1.attn.c_attn.weight False
transformer.h.1.attn.c_attn.bias False
transformer.h.1.attn.c_proj.weight False
transformer.h.1.attn.c_proj.bias False
transformer.h.1.ln_2.weight False
transformer.h.1.ln_2.bias False
transformer.h.1.mlp.c_fc.weight False
transformer.h.1.mlp.c_fc.bias False
transformer.h.1.mlp.c_proj.weight False
transformer.h.1.mlp.c_proj.bias False
transformer.h.2.ln_1.weight False
transformer.h.2.ln_1

### Train the Model

In [None]:

Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()



In [31]:
# Save model and Tokenizer
model.save_pretrained("./Models/DistilGPT2")

Configuration saved in ./Models/DistilGPT2\config.json
Model weights saved in ./Models/DistilGPT2\pytorch_model.bin


In [32]:
tokenizer.save_pretrained("./Models/DistilGPT2")

tokenizer config file saved in ./Models/DistilGPT2\tokenizer_config.json
Special tokens file saved in ./Models/DistilGPT2\special_tokens_map.json


('./Models/DistilGPT2\\tokenizer_config.json',
 './Models/DistilGPT2\\special_tokens_map.json',
 './Models/DistilGPT2\\vocab.json',
 './Models/DistilGPT2\\merges.txt',
 './Models/DistilGPT2\\added_tokens.json',
 './Models/DistilGPT2\\tokenizer.json')

In [None]:
# load in saved model and tokenizer 
tokenizer = AutoTokenizer.from_pretrained("./Models/DistilGPT2")
model = AutoModelForCausalLM.from_pretrained("./Models/DistilGPT2")

In [3]:
import torch

print(f"Is CUDA supported by this system? {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")

# Storing ID of current CUDA device
cuda_id = torch.cuda.current_device()
print(f"ID of current CUDA device: {torch.cuda.current_device()}")
		
print(f"Name of current CUDA device: {torch.cuda.get_device_name(cuda_id)}")


Is CUDA supported by this system? True
CUDA version: 11.7
ID of current CUDA device: 0
Name of current CUDA device: NVIDIA GeForce RTX 3070


## GPT-Neo

In [9]:
# load in model
torch.manual_seed(92)

MODEL_NAME = "EleutherAI/gpt-neo-1.3B"

tokenizer_neo = AutoTokenizer.from_pretrained(MODEL_NAME, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
model_neo = AutoModelForCausalLM.from_pretrained(MODEL_NAME).cuda()
model_neo.resize_token_embeddings(len(tokenizer_neo))


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 2048)

In [10]:
# Traing arguments
training_args_neo = TrainingArguments(output_dir='./Models/GPT-Neo', num_train_epochs=1, logging_steps=10000, save_steps=500000,
                                  per_device_train_batch_size=10, per_device_eval_batch_size=10, warmup_steps=10,
                                   weight_decay=0.05, logging_dir='./Models/GPT-Neo/logs', report_to='none' )

### Tokenize data

In [11]:
max_length = max([len(tokenizer_neo.encode(line)) for line in lines])

In [12]:
max_length

85

In [13]:
tokenization_dataset = PoetryDataset(lines, tokenizer_neo, max_length=max_length)
train_size = int(0.9  * len(tokenization_dataset))
train_dataset_neo, val_dataset_neo = random_split(tokenization_dataset, [train_size, len(tokenization_dataset) - train_size])

In [14]:
train_dataset_neo[0]

(tensor([50257,  3152,   257,  9480,  2786,   273,    11,   543,    11,   996,
           284,   262,  4151, 50256, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [15]:
print(f"Length of Training dataset: {len(train_dataset_neo)}")
print(f"Length of Validation dataset: {len(val_dataset_neo)}")

Length of Training dataset: 2776605
Length of Validation dataset: 308512


### Freeze layers

In [16]:
for name, param in model_neo.named_parameters():
    print(name, param.requires_grad)

transformer.wte.weight True
transformer.wpe.weight True
transformer.h.0.ln_1.weight True
transformer.h.0.ln_1.bias True
transformer.h.0.attn.attention.k_proj.weight True
transformer.h.0.attn.attention.v_proj.weight True
transformer.h.0.attn.attention.q_proj.weight True
transformer.h.0.attn.attention.out_proj.weight True
transformer.h.0.attn.attention.out_proj.bias True
transformer.h.0.ln_2.weight True
transformer.h.0.ln_2.bias True
transformer.h.0.mlp.c_fc.weight True
transformer.h.0.mlp.c_fc.bias True
transformer.h.0.mlp.c_proj.weight True
transformer.h.0.mlp.c_proj.bias True
transformer.h.1.ln_1.weight True
transformer.h.1.ln_1.bias True
transformer.h.1.attn.attention.k_proj.weight True
transformer.h.1.attn.attention.v_proj.weight True
transformer.h.1.attn.attention.q_proj.weight True
transformer.h.1.attn.attention.out_proj.weight True
transformer.h.1.attn.attention.out_proj.bias True
transformer.h.1.ln_2.weight True
transformer.h.1.ln_2.bias True
transformer.h.1.mlp.c_fc.weight True

In [17]:
# freeze the first two layers and 23 hidden units
for name, param in model_neo.named_parameters():
    if name.startswith("transformer.wte"):
        param.requires_grad = False
    if name.startswith("transformer.wpe"):
        param.requires_grad = False
    if any(x in name for x in ['.' + str(x) + '.' for x in range(23)]):
        param.requires_grad = False

In [18]:
# check layers
for name, param in model_neo.named_parameters():
    print(name, param.requires_grad)

transformer.wte.weight False
transformer.wpe.weight False
transformer.h.0.ln_1.weight False
transformer.h.0.ln_1.bias False
transformer.h.0.attn.attention.k_proj.weight False
transformer.h.0.attn.attention.v_proj.weight False
transformer.h.0.attn.attention.q_proj.weight False
transformer.h.0.attn.attention.out_proj.weight False
transformer.h.0.attn.attention.out_proj.bias False
transformer.h.0.ln_2.weight False
transformer.h.0.ln_2.bias False
transformer.h.0.mlp.c_fc.weight False
transformer.h.0.mlp.c_fc.bias False
transformer.h.0.mlp.c_proj.weight False
transformer.h.0.mlp.c_proj.bias False
transformer.h.1.ln_1.weight False
transformer.h.1.ln_1.bias False
transformer.h.1.attn.attention.k_proj.weight False
transformer.h.1.attn.attention.v_proj.weight False
transformer.h.1.attn.attention.q_proj.weight False
transformer.h.1.attn.attention.out_proj.weight False
transformer.h.1.attn.attention.out_proj.bias False
transformer.h.1.ln_2.weight False
transformer.h.1.ln_2.bias False
transformer.

### Train the model

In [19]:
Trainer(model=model_neo,  args=training_args_neo, train_dataset=train_dataset_neo, 
        eval_dataset=val_dataset_neo, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

***** Running training *****
  Num examples = 2776605
  Num Epochs = 1
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 277661
  Number of trainable parameters = 50356224


  0%|          | 0/277661 [00:00<?, ?it/s]

{'loss': 0.8757, 'learning_rate': 4.8200978926782185e-05, 'epoch': 0.04}
{'loss': 0.8238, 'learning_rate': 4.640015703166925e-05, 'epoch': 0.07}
{'loss': 0.8112, 'learning_rate': 4.459933513655633e-05, 'epoch': 0.11}
{'loss': 0.8019, 'learning_rate': 4.27985132414434e-05, 'epoch': 0.14}
{'loss': 0.7945, 'learning_rate': 4.099769134633047e-05, 'epoch': 0.18}
{'loss': 0.7902, 'learning_rate': 3.919686945121754e-05, 'epoch': 0.22}
{'loss': 0.7844, 'learning_rate': 3.739604755610461e-05, 'epoch': 0.25}
{'loss': 0.7811, 'learning_rate': 3.559522566099168e-05, 'epoch': 0.29}
{'loss': 0.7778, 'learning_rate': 3.3794403765878745e-05, 'epoch': 0.32}
{'loss': 0.7751, 'learning_rate': 3.199358187076582e-05, 'epoch': 0.36}
{'loss': 0.7725, 'learning_rate': 3.019275997565289e-05, 'epoch': 0.4}
{'loss': 0.7696, 'learning_rate': 2.839193808053996e-05, 'epoch': 0.43}
{'loss': 0.7676, 'learning_rate': 2.659111618542703e-05, 'epoch': 0.47}
{'loss': 0.7668, 'learning_rate': 2.47902942903141e-05, 'epoch':



Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 123748.8893, 'train_samples_per_second': 22.437, 'train_steps_per_second': 2.244, 'train_loss': 0.7741056002052638, 'epoch': 1.0}


TrainOutput(global_step=277661, training_loss=0.7741056002052638, metrics={'train_runtime': 123748.8893, 'train_samples_per_second': 22.437, 'train_steps_per_second': 2.244, 'train_loss': 0.7741056002052638, 'epoch': 1.0})

In [20]:
# Save model and Tokenizer
model_neo.save_pretrained("./Models/GPT-Neo")

Configuration saved in ./Models/GPT-Neo\config.json
Model weights saved in ./Models/GPT-Neo\pytorch_model.bin


In [21]:
tokenizer_neo.save_pretrained("./Models/GPT-Neo")

tokenizer config file saved in ./Models/GPT-Neo\tokenizer_config.json
Special tokens file saved in ./Models/GPT-Neo\special_tokens_map.json


('./Models/GPT-Neo\\tokenizer_config.json',
 './Models/GPT-Neo\\special_tokens_map.json',
 './Models/GPT-Neo\\vocab.json',
 './Models/GPT-Neo\\merges.txt',
 './Models/GPT-Neo\\added_tokens.json',
 './Models/GPT-Neo\\tokenizer.json')

In [22]:
# load in saved model and tokenizer 
tokenizer_neo = AutoTokenizer.from_pretrained("./Models/GPT-Neo")
model_neo = AutoModelForCausalLM.from_pretrained("./Models/GPT-Neo")

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./Models/GPT-Neo\config.json
Model config GPTNeoConfig {
  "_name_or_path": "./Models/GPT-Neo",
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      12
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_size": 2048,
  "