# File structures comparison

This notebook can be used to compare different formats of the input data (verse tagging, how the poems are grouped into sequences, ...) by looking at the validation loss.

Set `run_as_standalone_nb = True` if you are running this notebook outside of a clone of its repository (https://github.com/Poems-AI/AI.git). For example, in a Colab or Kaggle notebook.

In [None]:
run_as_standalone_nb = False


from pathlib import Path


# Make the project library importable, either from a fresh clone (standalone mode)
# or through the repository's own import helper.
if run_as_standalone_nb:
    import sys    
    # The library is expected in an `AI` directory under the current working directory
    root_lib_path = Path('AI').resolve()
    if not root_lib_path.exists():
        !git clone https://github.com/Poems-AI/AI.git
    # Put the clone first in `sys.path`, guarding against duplicates when the cell is re-run
    if str(root_lib_path) not in sys.path:
        sys.path.insert(0, str(root_lib_path))
    
    # Install the library's requirements and git-lfs
    # NOTE(review): git-lfs is presumably needed to push/pull model weights — confirm
    !pip install -r {root_lib_path/'requirements.txt'}
    !apt-get install git-lfs
    !git lfs install
else:
    # NOTE(review): presumably a helper module of the repository that adjusts `sys.path` — confirm
    import local_lib_import

In [None]:
from datasets import load_dataset
from enum import auto, Enum
from functools import partial
from happytransformer import fine_tuning_util, HappyGeneration
from huggingface_hub import login, notebook_login
import math
import numpy as np
import os
import pandas as pd
import re
import transformers
from transformers import default_data_collator, Trainer, TrainingArguments
from transformers.optimization import SchedulerType
from transformers.trainer_utils import get_last_checkpoint
import torch
import torch.nn.functional as F
from typing import List, Tuple

In [None]:
from poemsai.data import Lang, lang_to_str, PoemsFileConfig
from poemsai.hf_utils import model_to_url
from poemsai.metrics import (compute_lm_accuracy, get_compute_metrics_metadataless, MetadataLessLoss, 
                             preprocess_logits_for_accuracy, preprocess_logits_for_metadataless_loss)
from poemsai.nb_utils import commit_checkpoint_to_hf_hub, download_checkpoint_from_hf_hub
from poemsai.tokenization import add_special_token
from poemsai.trainer import PoemsTrainer

Clone our datasets repo:

In [None]:
!git clone https://github.com/Poems-AI/dataset.git

# Log in to HuggingFace

In [None]:
HF_USER = 'YOUR_HF_USER'

**Option 1: notebook_login.**

In [None]:
notebook_login()

**Option 2: get token.** Unfortunately, you need to manually set your password. Every time you push to hub, you'll need to pass `use_auth_token=login_token`

In [None]:
pwd = 'YOUR_HF_PASSWORD'
login_token = login(HF_USER, pwd)
pwd = None

**Option 3 (recommended): interact with the git repo that stores your model** and pass the password every time you commit
<br><br>
Before committing, you need to tell git your user and email (from HuggingFace)

In [None]:
HF_EMAIL = "YOUR_HF_EMAIL"
!git config --global user.email $HF_EMAIL
!git config --global user.name $HF_USER

You can push to hub by calling `commit_checkpoint_to_hf_hub`. For instance:
```
commit_checkpoint_to_hf_hub('gpt2-poems.en', HF_USER, './checkpoints/checkpoint-7170', 
                            message='Update model after 50 epochs', pwd='your_hf_password')
```
Be aware that this will copy everything from the checkpoint to the repository. If you need more control,
clone the model repository and interact with it like with any other GitHub repository. You can get the url of the
model repository with `model_to_url`

# Data

Set the language of the poems generator (`Lang.English` or `Lang.Spanish`)

In [None]:
lang = Lang.English
lang_str = lang_to_str(lang)

In [None]:
def get_datasets(txt_paths, tokenizer, num_procs=1):
    """Load the train/eval text files and tokenize them for causal LM training.

    Args:
        txt_paths: pair `(train_path, eval_path)` with the paths of the text files.
        tokenizer: tokenizer forwarded to `fine_tuning_util.preprocess_concatenate`.
        num_procs: number of processes forwarded to the preprocessing step.

    Returns:
        The result of `fine_tuning_util.preprocess_concatenate`, computed over a
        dataset with the splits "train" and "eval".
    """
    train_file, eval_file = txt_paths
    data_files = {"train": train_file, "eval": eval_file}
    raw_dataset = load_dataset("text", data_files=data_files)
    # mlm=False: no masked language modeling
    return fine_tuning_util.preprocess_concatenate(tokenizer, raw_dataset, num_procs, mlm=False)

# Tokenizer

In [None]:
def setup_tokenizer(tokenizer, file_config, model):
    """Register the tags declared in `file_config` as special tokens.

    Every non-empty tag (beginning of verse, end of verse, end of poem) is passed to
    `add_special_token` together with `tokenizer` and `model`. The `copy_from` argument
    is '' for the beginning of verse, '\\n' for the end of verse and the tokenizer's
    EOS token for the end of poem.

    Returns:
        Tuple `(bov_token_id, eov_token_id, eop_token_id)` holding the value returned by
        `add_special_token` for each tag, or `None` for the tags that are empty in
        `file_config`.
    """
    def _register(tag, get_copy_from):
        # `get_copy_from` is only called for non-empty tags, so that
        # `tokenizer.eos_token` is not read unless it is actually needed
        if tag != '':
            return add_special_token(tag, tokenizer, model, copy_from=get_copy_from())
        return None

    bov_token_id = _register(file_config.beginning_of_verse_token, lambda: '')
    eov_token_id = _register(file_config.end_of_verse_token, lambda: '\n')
    eop_token_id = _register(file_config.end_of_poem_token, lambda: tokenizer.eos_token)

    return bov_token_id, eov_token_id, eop_token_id

# Training

In [None]:
def freeze(params):
    """Set `requires_grad = False` on every element of the iterable `params`."""
    for param in params:
        param.requires_grad = False

Create a directory to store the checkpoints automatically saved by the trainer:

In [None]:
!mkdir checkpoints

In [None]:
def build_trainer(model, tokenizer, datasets, n_epochs, bs=1, output_path='./checkpoints', optimizers=(None, None),
                  compute_metrics=None, preprocess_logits_for_metrics=None, **train_kwargs):
    """Create a `PoemsTrainer` that evaluates and saves a checkpoint once per epoch.

    Args:
        model: model to train.
        tokenizer: tokenizer handed to the trainer.
        datasets: mapping with the keys 'train' and 'eval'.
        n_epochs: number of training epochs.
        bs: per-device batch size, used for both training and evaluation.
        output_path: directory where the checkpoints are written.
        optimizers: `(optimizer, lr_scheduler)` pair forwarded to the trainer.
        compute_metrics: optional metrics callback forwarded to the trainer.
        preprocess_logits_for_metrics: optional logits preprocessing callback forwarded
            to the trainer.
        **train_kwargs: extra keyword arguments for `TrainingArguments`
            (for instance `learning_rate`).

    Returns:
        The configured `PoemsTrainer`.
    """
    every_epoch = transformers.trainer_utils.IntervalStrategy.EPOCH
    args = TrainingArguments(
        output_dir=output_path,
        num_train_epochs=n_epochs,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        evaluation_strategy=every_epoch,
        save_strategy=every_epoch,
        fp16=False,
        report_to=["none"],
        **train_kwargs
    )
    return PoemsTrainer(
        model=model,
        args=args,
        train_dataset=datasets['train'],
        eval_dataset=datasets['eval'],
        tokenizer=tokenizer,
        data_collator=default_data_collator,
        optimizers=optimizers,
        compute_metrics=compute_metrics,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )

In [None]:
orig_model_name = "gpt2"

# TR 1: simple file structure

The only extra characters present in the inputs are line breaks at the end of each verse.

Set `resume_training = True` to continue training your own model from a previous checkpoint.

In [None]:
resume_training = False

In [None]:
train_text_path = f"dataset/all.txt/{lang_str}.txt/simple/all_poems.train.{lang_str}.txt"
valid_text_path = f"dataset/all.txt/{lang_str}.txt/simple/all_poems.valid.{lang_str}.txt"
custom_model_name = f"gpt2-poems-simple.{lang_str}"
model_url = model_to_url(custom_model_name, HF_USER)

In [None]:
# Clone repo of our model to commit there later
if resume_training:
    hf_pwd = 'YOUR_HF_PASSWORD'
    download_checkpoint_from_hf_hub(custom_model_name, HF_USER, hf_pwd)
    hf_pwd = ''

In [None]:
happy_gen = HappyGeneration("GPT2", orig_model_name)

In [None]:
datasets = get_datasets((train_text_path, valid_text_path), happy_gen.tokenizer, num_procs=1)
datasets

## TR 1a: all layers unfrozen since the beginning

In [None]:
n_epochs = 50
bs = 1
n_steps_by_epoch = len(datasets['train']) // bs
trainer = build_trainer(happy_gen.model, happy_gen.tokenizer, datasets, n_epochs, bs=bs, learning_rate=5e-5)

In [None]:
(trainer.args.num_train_epochs, trainer.args.max_steps, trainer.args.evaluation_strategy, trainer.args.eval_steps, trainer.args.save_strategy, 
trainer.args.learning_rate, trainer.args.lr_scheduler_type)

In [None]:
trainer.train(resume_from_checkpoint=custom_model_name if resume_training else None)

In [None]:
!ls -l checkpoints

In [None]:
custom_model_name, get_last_checkpoint('./checkpoints')

Don't forget to set your Hugging Face password before committing:

In [None]:
commit_checkpoint_to_hf_hub(custom_model_name, HF_USER, get_last_checkpoint('./checkpoints'),
                            message='Add model, 50 epochs', pwd='YOUR_HF_PASSWORD')

In [None]:
# happy_gen.tokenizer.push_to_hub(model_url, use_auth_token=login_token)
# happy_gen.model.push_to_hub(model_url, use_auth_token=login_token)

## Tr 1b: everything frozen but head

In [None]:
n_epochs = 50
bs = 1
n_steps_by_epoch = len(datasets['train']) // bs
trainer = build_trainer(happy_gen.model, happy_gen.tokenizer, datasets, n_epochs, bs=bs, learning_rate=5e-5)

In [None]:
(trainer.args.num_train_epochs, trainer.args.max_steps, trainer.args.evaluation_strategy, trainer.args.eval_steps, 
 trainer.args.save_strategy, trainer.args.learning_rate, trainer.args.lr_scheduler_type)

In [None]:
len(list(happy_gen.model.parameters())), sum([p.requires_grad for p in happy_gen.model.parameters()])
# > 148, 148

Weights of input embedding and head linear layer are tied, so if we train the head we are forced to train the embedding too

In [None]:
(happy_gen.model.transformer.wte.weight.shape, happy_gen.model.lm_head.weight.shape, 
 id(happy_gen.model.transformer.wte.weight) - id(happy_gen.model.lm_head.weight),
 torch.allclose(happy_gen.model.transformer.wte.weight, happy_gen.model.lm_head.weight))

Now, we freeze everything but head and input embedding:

In [None]:
# Freeze every parameter except the LM head weight; the comparison is by object identity
# (`id`), not by value.
# NOTE(review): relies on the head weight being the very same tensor as the input embedding
# (weight tying), so the embedding stays trainable too — confirm with the tying check cell.
freeze(p for p in happy_gen.model.parameters() if id(p) != id(happy_gen.model.lm_head.weight))
# Sanity check: number of parameters that still require grad, and whether the head is one of them
sum([p.requires_grad for p in happy_gen.model.parameters()]), happy_gen.model.lm_head.weight.requires_grad
# Expected output: (1, True)

In [None]:
trainer.train(resume_from_checkpoint=custom_model_name if resume_training else None)

In [None]:
commit_checkpoint_to_hf_hub(custom_model_name, HF_USER, get_last_checkpoint('./checkpoints'),
                            message='Add model, 50 epochs', pwd='YOUR_HF_PASSWORD')

In [None]:
# happy_gen.tokenizer.push_to_hub(model_url, use_auth_token=login_token)
# happy_gen.model.push_to_hub(model_url, use_auth_token=login_token)

## Tr 1c: discriminative learning rates

The fact that the input embedding is tied to the output linear layer probably makes this strategy less interesting, given that:
- The final linear layer needs to have one of the highest learning rates. 
- If this is the case, the input embedding has a relatively high learning rate too (the same as the linear layer).
- Then, it doesn't make sense for the early layers to learn slowly when the layer that feeds them is changing quickly.

In [None]:
def create_opt_disc_lrs(model, min_lr, max_lr, head_lr):
    """Build an AdamW optimizer with discriminative (per-block) learning rates.

    The learning rates of the transformer blocks `model.transformer.h` grow geometrically
    from `min_lr` (first block) to `max_lr` (last block). The position embedding
    (`wpe`) and the final layer norm (`ln_f`) use `min_lr`, and the LM head uses `head_lr`.

    Args:
        model: model exposing `transformer.h`, `transformer.wpe`, `transformer.ln_f`
            and `lm_head`.
        min_lr: learning rate of the first block; must be non-zero when there is more
            than one block.
        max_lr: learning rate of the last block.
        head_lr: learning rate of the LM head.

    Returns:
        A `torch.optim.AdamW` instance with one parameter group per component.
    """
    # Use the `model` argument everywhere instead of a notebook global, so that the
    # optimizer always refers to the model it was asked to optimize
    blocks = model.transformer.h
    n_blocks = len(blocks)
    if n_blocks > 1:
        lr_mult = (max_lr / min_lr) ** (1 / (n_blocks - 1))
        blocks_lrs = [min_lr * lr_mult ** i for i in range(n_blocks)]
    else:
        # A single block is both the first and the last one: there is no progression to
        # compute (and the exponent above would divide by zero), so it gets `max_lr`
        blocks_lrs = [max_lr] * n_blocks
    blocks_params = [{'params': block.parameters(), 'lr': lr}
                     for block, lr in zip(blocks, blocks_lrs)]
    # NOTE(review): the token embedding (`transformer.wte`) is not listed explicitly; it is
    # assumed to be tied to `lm_head.weight` and therefore trained with `head_lr` — confirm
    return torch.optim.AdamW([
        {'params': model.transformer.wpe.parameters(), 'lr': min_lr},
        {'params': model.transformer.ln_f.parameters(), 'lr': min_lr},
        {'params': model.lm_head.parameters(), 'lr': head_lr},
        *blocks_params
    ], lr=head_lr, weight_decay=0, betas=(0.9, 0.999))

In [None]:
# Tr 1c setup: train with a custom optimizer that assigns a different learning rate to each block.
# NOTE(review): `happy_gen.model` is used in whatever state the previous cells left it
# (e.g. already fine-tuned by Tr 1a or partially frozen by Tr 1b) — presumably the model
# should be reloaded before running this section on its own; confirm.
n_epochs = 50
bs = 1
# Only referenced by the commented-out scheduler below
n_steps_by_epoch = len(datasets['train']) // bs
# Arguments: min_lr=1e-7 (first block), max_lr=1e-4 (last block), head_lr=1e-4
opt = create_opt_disc_lrs(happy_gen.model, 1e-7, 1e-4, 1e-4)
#sched = transformers.get_constant_schedule_with_warmup(opt, n_steps_by_epoch//2, last_epoch=-1)
# No scheduler is passed (second element of `optimizers` is None)
trainer = build_trainer(happy_gen.model, happy_gen.tokenizer, datasets, n_epochs, bs=bs, optimizers=(opt, None))

In [None]:
trainer.train(resume_from_checkpoint=custom_model_name if resume_training else None)

In [None]:
commit_checkpoint_to_hf_hub(custom_model_name, HF_USER, get_last_checkpoint('./checkpoints'),
                            message='Add model, 50 epochs', pwd='YOUR_HF_PASSWORD')

In [None]:
# happy_gen.tokenizer.push_to_hub(model_url, use_auth_token=login_token)
# happy_gen.model.push_to_hub(model_url, use_auth_token=login_token)

# Tr 2: tag end of poems

The only special tags are line breaks at the end of each verse and "<|endoftext|>" at the end of each poem.

Set `resume_training = True` to continue training your own model from a previous checkpoint.

In [None]:
resume_training = False

In [None]:
train_text_path = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.train.{lang_str}.txt"
valid_text_path = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.valid.{lang_str}.txt"
custom_model_name = f"gpt2-poems-endtags.{lang_str}"
model_url = model_to_url(custom_model_name, HF_USER)

In [None]:
# Clone repo of our model to commit there later
if resume_training:
    hf_pwd = 'YOUR_HF_PASSWORD'
    download_checkpoint_from_hf_hub(custom_model_name, HF_USER, hf_pwd)
    hf_pwd = ''

In [None]:
happy_gen = HappyGeneration("GPT2", orig_model_name)

In [None]:
# Load the description of the file format (tags in use) that sits next to the training file.
# The file name follows the selected language instead of being hard-coded to English.
# NOTE(review): assumes every language directory ships `all_poems.<lang>.conf.json`,
# mirroring the naming of the train/valid files — confirm against the dataset repo.
file_config = PoemsFileConfig.from_json(Path(train_text_path).parent/f'all_poems.{lang_str}.conf.json')
# Register the tags as special tokens of the tokenizer/model
begin_verse_id, end_verse_id, end_poem_id = setup_tokenizer(happy_gen.tokenizer, 
                                                            file_config, 
                                                            happy_gen.model)

# Keep only the tags actually used by this file format
our_special_tokens = (file_config.end_of_verse_token,
                      file_config.beginning_of_verse_token,
                      file_config.end_of_poem_token,)
our_special_tokens = [t for t in our_special_tokens if t != '']

# Sanity check 1: every tag is known to the tokenizer as a special token
assert set(our_special_tokens).issubset(happy_gen.tokenizer.all_special_tokens), (
       f'{our_special_tokens} is not a subset of {happy_gen.tokenizer.all_special_tokens}'
)
# Sanity check 2: each tag is encoded as exactly one token
assert len(happy_gen.tokenizer.encode(''.join(our_special_tokens))) == len(our_special_tokens)
(happy_gen.tokenizer.additional_special_tokens, 
 happy_gen.tokenizer.additional_special_tokens_ids, 
 happy_gen.tokenizer.all_special_tokens, 
 begin_verse_id, end_verse_id, end_poem_id)

In [None]:
datasets = get_datasets((train_text_path, valid_text_path), happy_gen.tokenizer, num_procs=1)
datasets

Let's see an example of the decoding of an already tokenized sample. If everything is ok, '\\n' should appear at the end of each verse and '<|endoftext|>' at the end of each poem.

In [None]:
happy_gen.tokenizer.decode(datasets['train'][0]['input_ids'])

For validation we'll use a modified version of the cross entropy loss that doesn't take into account the metadata inserted into the poems, like the end of poem tag.

In [None]:
bs = 1
n_steps_by_epoch = len(datasets['train']) // bs
compute_metrics = get_compute_metrics_metadataless(begin_verse_id=begin_verse_id, 
                                                   end_verse_id=end_verse_id, 
                                                   end_poem_id=end_poem_id)
trainer = build_trainer(happy_gen.model, happy_gen.tokenizer, datasets, 50, bs=bs, learning_rate=5e-5,
                        compute_metrics=compute_metrics,
                        preprocess_logits_for_metrics=preprocess_logits_for_metadataless_loss)

In [None]:
trainer.train(resume_from_checkpoint=custom_model_name if resume_training else None)

In [None]:
!ls -l checkpoints

In [None]:
custom_model_name, get_last_checkpoint('./checkpoints')

In [None]:
commit_checkpoint_to_hf_hub(custom_model_name, HF_USER, get_last_checkpoint('./checkpoints'),
                            message='Add model, 50 epochs', pwd='YOUR_HF_PASSWORD')

In [None]:
# happy_gen.tokenizer.push_to_hub(model_url, use_auth_token=login_token)
# happy_gen.model.push_to_hub(model_url, use_auth_token=login_token)

# Tr 3: include all tags and end of previous verses

This version of the input files includes:
* Beginning of verse tag
* End of verse tag
* End of poem tag
* The last word of each of the 4 previous verses is placed right before the beginning of verse tag

In [None]:
resume_training = False

In [None]:
train_text_path = f"dataset/all.txt/{lang_str}.txt/all_tags_4_prev_verses/all_poems.train.{lang_str}.txt"
valid_text_path = f"dataset/all.txt/{lang_str}.txt/all_tags_4_prev_verses/all_poems.valid.{lang_str}.txt"
custom_model_name = f"gpt2-poems-alltags-4prev-verses.{lang_str}"
model_url = model_to_url(custom_model_name, HF_USER)

In [None]:
# Clone repo of our model to commit there later
if resume_training:
    hf_pwd = 'YOUR_HF_PASSWORD'
    download_checkpoint_from_hf_hub(custom_model_name, HF_USER, hf_pwd)
    hf_pwd = ''

In [None]:
happy_gen = HappyGeneration("GPT2", orig_model_name)

In [None]:
# Load the description of the file format (tags in use) that sits next to the training file.
# The file name follows the selected language instead of being hard-coded to English.
# NOTE(review): assumes every language directory ships `all_poems.<lang>.conf.json`,
# mirroring the naming of the train/valid files — confirm against the dataset repo.
file_config = PoemsFileConfig.from_json(Path(train_text_path).parent/f'all_poems.{lang_str}.conf.json')
# Register the tags as special tokens of the tokenizer/model
begin_verse_id, end_verse_id, end_poem_id = setup_tokenizer(happy_gen.tokenizer, 
                                                            file_config, 
                                                            happy_gen.model)

# Keep only the tags actually used by this file format
our_special_tokens = (file_config.end_of_verse_token,
                      file_config.beginning_of_verse_token,
                      file_config.end_of_poem_token,)
our_special_tokens = [t for t in our_special_tokens if t != '']

# Sanity check 1: every tag is known to the tokenizer as a special token
assert set(our_special_tokens).issubset(happy_gen.tokenizer.all_special_tokens), (
       f'{our_special_tokens} is not a subset of {happy_gen.tokenizer.all_special_tokens}'
)
# Sanity check 2: each tag is encoded as exactly one token
assert len(happy_gen.tokenizer.encode(''.join(our_special_tokens))) == len(our_special_tokens)
(happy_gen.tokenizer.additional_special_tokens, 
 happy_gen.tokenizer.additional_special_tokens_ids, 
 happy_gen.tokenizer.all_special_tokens, 
 begin_verse_id, end_verse_id, end_poem_id)

In [None]:
datasets = get_datasets((train_text_path, valid_text_path), happy_gen.tokenizer, num_procs=1)
datasets

Let's see an example of the decoding of an already tokenized sample. If everything is ok:
* '\<bov\>' should appear at the beginning of each verse 
* '\\\n' should appear at the end of each verse 
* '<|endoftext|>' should appear at the end of each poem.
* The termination of the 4 previous verses should appear after each '\<bov\>' token

In [None]:
happy_gen.tokenizer.decode(datasets['train'][0]['input_ids'])

For validation we'll use a modified version of the cross entropy loss that doesn't take into account the metadata inserted into the poems, like the end of poem tag or the terminations of previous verses.

In [None]:
bs = 1
n_steps_by_epoch = len(datasets['train']) // bs
compute_metrics = get_compute_metrics_metadataless(begin_verse_id=begin_verse_id, 
                                                   end_verse_id=end_verse_id, 
                                                   end_poem_id=end_poem_id)
trainer = build_trainer(happy_gen.model, happy_gen.tokenizer, datasets, 50, bs=bs, learning_rate=5e-5,
                        compute_metrics=compute_metrics,
                        preprocess_logits_for_metrics=preprocess_logits_for_metadataless_loss)

In [None]:
trainer.train(resume_from_checkpoint=custom_model_name if resume_training else None)

In [None]:
!ls -l checkpoints

In [None]:
custom_model_name, get_last_checkpoint('./checkpoints')

In [None]:
commit_checkpoint_to_hf_hub(custom_model_name, HF_USER, get_last_checkpoint('./checkpoints'),
                            message='Add model, 50 epochs', pwd='YOUR_HF_PASSWORD')

In [None]:
# happy_gen.tokenizer.push_to_hub(model_url, use_auth_token=login_token)
# happy_gen.model.push_to_hub(model_url, use_auth_token=login_token)