In [None]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m65.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m76.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch
import shutil
from torch.utils.data import Dataset, random_split
from transformers import Trainer, TrainingArguments, GPTNeoForCausalLM, GPT2Tokenizer
from google.colab import drive

In [None]:
!wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt

--2023-04-11 21:33:08--  https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 94275 (92K) [text/plain]
Saving to: ‘shakespeare.txt’


2023-04-11 21:33:08 (9.41 MB/s) - ‘shakespeare.txt’ saved [94275/94275]



In [None]:
# Read the text file and returns list of lines in text
def read_file(file_path):
    with open(file_path) as f:
        lines = [line for line in f]
        # lines.remove("")
    return lines


In [None]:
file_path = "/content/shakespeare.txt"

texts = read_file(file_path)
sonnets = []
sonnet = []
for text in texts:
  if len(text)>1:
    sonnet.append(text)
  else:
    sonnets.append(''.join(sonnet))
    sonnet = []

# Remove unnecessary texts


In [None]:
  # Prepare sonnets
datas = sonnets[2:-1]
print(len(datas))
for data in datas:
    if len(data)<1:
      datas.remove(data)
print(len(datas))

289
212


In [None]:
# Custome dataset class to load dataset
class ShakespeareDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            # Encode the descriptions using the GPT-Neo tokenizer
            encodings_dict = tokenizer('<|startoftext|>' 
                                        + txt +    
                                        '<|endoftext|>',
                                        truncation=True,
                                        max_length=max_length, 
                                            padding="max_length")
            input_ids = torch.tensor(encodings_dict['input_ids'])    
            self.input_ids.append(input_ids)
            mask = torch.tensor(encodings_dict['attention_mask'])
            self.attn_masks.append(mask)
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [None]:
# Set the random seed to a fixed value to get reproducible results 
torch.manual_seed(42)

# Download the pre-trained GPT-Neo model's tokenizer
# Add the custom tokens denoting the beginning and the end 
# of the sequence and a special token for padding
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-125M",    
                            bos_token='<|startoftext|>',
                            eos_token='<|endoftext|>',
                            pad_token='<|pad|>')

# Download the pre-trained GPT-Neo model and transfer it to the GPU
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M").cuda()

# Resize the token embeddings because we've just added 3 new tokens 
model.resize_token_embeddings(len(tokenizer))

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/526M [00:00<?, ?B/s]

Embedding(50259, 768)

In [None]:
max_length = max([len(tokenizer.encode(sonnet)) for sonnet in datas])

# Load dataset
dataset = ShakespeareDataset(sonnets, tokenizer, max_length)

# Split data into train/val
train_size = int(0.9 * len(dataset))

train_data, val_data = random_split(dataset, [train_size, len(dataset) - train_size])

max_length

351

In [None]:
tokenizer.batch_decode(val_data[4])

In [None]:
# Here I will pass the output directory where 
# the model predictions and checkpoints will be stored, 
# batch sizes for the training and validation steps, 
# and warmup_steps to gradually increase the learning rate
learning_rates = [5e-5, 3e-5, 1e-5]

for learning_rate in learning_rates:

    training_args = TrainingArguments(output_dir=f'./results_{learning_rate}',
                                      num_train_epochs=5,
                                      logging_steps=1000,
                                      save_steps=1000,
                                      evaluation_strategy='steps',
                                      eval_steps=1000,                               
                                      per_device_train_batch_size=2,
                                      per_device_eval_batch_size=2,
                                      warmup_steps=100,
                                      learning_rate=learning_rate,
                                      weight_decay=0.01,  
                                      logging_dir=f'./logs_{learning_rate}')

    trainer = Trainer(model=model, args=training_args,  
                      train_dataset=train_data,
                      eval_dataset=val_data, 
                      # This custom collate function is necessary 
                      # to built batches of data
                      data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),       
                  'attention_mask': torch.stack([f[1] for f in data]),
                  'labels': torch.stack([f[0] for f in data])})

    # Start training process!
    print(f"Training result for learning rate: {learning_rate}")
    trainer.train()
    print("\n\n")

Training result for learning rate: 5e-05




Step,Training Loss,Validation Loss





Training result for learning rate: 3e-05


Step,Training Loss,Validation Loss





Training result for learning rate: 1e-05


Step,Training Loss,Validation Loss







In [None]:
training_args = TrainingArguments(output_dir=f'./results',
                                      num_train_epochs=5,
                                      logging_steps=1000,
                                      save_steps=5000,
                                      evaluation_strategy='steps',
                                      eval_steps=1000,                               
                                      per_device_train_batch_size=2,
                                      per_device_eval_batch_size=2,
                                      warmup_steps=100,
                                      learning_rate=5e-5,
                                      weight_decay=0.01,  
                                      logging_dir=f'./logs')

trainer = Trainer(model=model, args=training_args,  
                  train_dataset=train_data,
                  eval_dataset=val_data, 
                  # This custom collate function is necessary 
                  # to built batches of data
                  data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),       
              'attention_mask': torch.stack([f[1] for f in data]),
              'labels': torch.stack([f[0] for f in data])})

# Start training process!
trainer.train()




Step,Training Loss,Validation Loss


TrainOutput(global_step=655, training_loss=1.0541496451574428, metrics={'train_runtime': 154.0388, 'train_samples_per_second': 8.504, 'train_steps_per_second': 4.252, 'total_flos': 234581319198720.0, 'train_loss': 1.0541496451574428, 'epoch': 5.0})

In [None]:
generated = tokenizer("<|startoftext|>", return_tensors="pt").input_ids.cuda()
sample_outputs = model.generate(generated, do_sample=True, top_k=50,
                                # bos_token='<|startoftext|>',
                                # eos_token='<|endoftext|>', pad_token='<|pad|>',
                                max_length=300, top_p=0.95, temperature=1.9, num_return_sequences=20)
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: 
1: 
2: Like as an ethereal rose or angel, a heavenly presence and heavenly eyes can't wait to come after one who so calls, with unendowed grace, the beauty of another by night the rose that blooses on day forth as if every bud datively lies waiting to catch the fruit of his loving kindness when a kiss brings it forth to flower anew
The heavenly majesty's kiss in this alone shows that other than nature itself alone hath a constant hold upon one whom love lends it comfort and holds both at their side
Or for it proves false
Presents heavenly praise as from another 'tis best to be false
If we were to be like our nativity each time seeing what beauty affable looks of, was converted ere she gave a sign that she saw this to stand alone and give light to beauty and beauty above to night it would stand
That it could never stay this constant eye.
For what is it by what it
3: In the second case where possession is made and
uncontented discontent is expressed through him, I am willing you woul

In [None]:
trainer.save_model("/content/model")

In [None]:
import os
os.mkdir('/content/model')

In [None]:
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from huggingface_hub import HfApi

api = HfApi()

In [None]:
api.create_repo(repo_id="NLP")

In [None]:
# Upload your model to huggingface. You can clone the repo anytime to use the model.
import os

model_pth = "/content/model"

files = os.listdir(model_pth)
for fi in files:
    print(os.path.join(model_pth, fi))

    api.upload_file(
        path_or_fileobj=os.path.join(model_pth, fi),
        path_in_repo=fi,
        repo_id="sunilrufus/NLP",
        repo_type="model",
    )

/content/model/config.json
/content/model/pytorch_model.bin


pytorch_model.bin:   0%|          | 0.00/551M [00:00<?, ?B/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

/content/model/training_args.bin


training_args.bin:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

/content/model/generation_config.json
