# Training of conditional generators

This notebook trains and experiments with GPT2-based conditional text generators using different configurations: labeling strategies, loss functions, architectures, ...

We use two types of labels to condition the generation: form and topic.

Set `run_as_standalone_nb = True` if you are running this notebook outside of a clone of its repository (https://github.com/Poems-AI/AI.git). For example, in a Colab or Kaggle notebook.

In [None]:
run_as_standalone_nb = True


from pathlib import Path


if run_as_standalone_nb:
    import sys    
    root_lib_path = Path('AI').resolve()
    if not root_lib_path.exists():
        !git clone https://github.com/Poems-AI/AI.git
    if str(root_lib_path) not in sys.path:
        sys.path.insert(0, str(root_lib_path))

    !pip install -r {root_lib_path/'requirements.txt'}
    !apt-get install git-lfs
    !git lfs install
else:
    import local_lib_import

In [None]:
from datasets import concatenate_datasets, DatasetDict, load_dataset, load_metric
from enum import auto, Enum
from functools import partial
from happytransformer import fine_tuning_util, GENEvalArgs, GENSettings, GENTrainArgs, HappyGeneration
from huggingface_hub import login, notebook_login
import math
import numpy as np
import os
import pandas as pd
from pathlib import Path
import re
import tempfile
import transformers
from transformers import (
    AutoModelForSequenceClassification, AutoTokenizer, default_data_collator, Trainer, TrainingArguments
)
from transformers.optimization import SchedulerType
from transformers.trainer_utils import get_last_checkpoint
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import List, Tuple
import warnings

In [None]:
from poemsai.data import LabelsType, LabelsDecoderExplained, Lang, lang_to_str, PoemsFileConfig
from poemsai.hf_utils import model_to_url
from poemsai.metrics import (
    ConditionalGenLoss, get_compute_metrics_metadataless, MetadataLessLoss, 
    preprocess_logits_for_metadataless_loss
)
from poemsai.nb_utils import commit_checkpoint_to_hf_hub, download_checkpoint_from_hf_hub
from poemsai.tokenization import add_special_token
from poemsai.trainer import PoemsTrainer

Clone our datasets repo:

In [None]:
!git clone https://github.com/Poems-AI/dataset.git

# Log in to HuggingFace

In [None]:
HF_USER = "YOUR_HUGGINGFACE_USER"

**Option 1: notebook_login.**

In [None]:
notebook_login()

**Option 2: get token.** Unfortunately, you need to manually set your password. Every time you push to hub, you'll need to pass `use_auth_token=login_token`

In [None]:
from getpass import getpass

# Prompt for the password instead of typing it into the notebook source,
# where it could be saved or shared by accident
pwd = getpass('HuggingFace password: ')
login_token = login(HF_USER, pwd)
# Drop the reference so the password does not linger in the kernel namespace
pwd = None

**Option 3 (recommended): interact with the git repo that stores your model** and pass the password every time you commit
<br><br>
Before committing, you need to tell git your user and email (from HuggingFace)

In [None]:
# Email of the HuggingFace account; used with `HF_USER` as the global git identity
HF_EMAIL = "YOUR_HUGGINGFACE_EMAIL"
# IPython substitutes `$HF_EMAIL` / `$HF_USER` with the Python variables before
# handing the command to the shell
!git config --global user.email $HF_EMAIL
!git config --global user.name $HF_USER

You can push to hub by calling `commit_checkpoint_to_hf_hub`. For instance:
```
commit_checkpoint_to_hf_hub('gpt2-poems.en', HF_USER, './checkpoints/checkpoint-7170', 
                            message='Update model after 50 epochs', pwd='your_hf_password')
```
Be aware that this will copy everything from the checkpoint to the repository. If you need more control,
clone the model repository and interact with it like with any other GitHub repository. You can get the url of the
model repository with `model_to_url`

# Data

Currently, our conditional generators only work with English poems.

In [None]:
lang = Lang.English
lang_str = lang_to_str(lang)

In [None]:
def get_datasets(txt_paths, tokenizer, num_procs=1):
    """Load the train/eval text files and preprocess them for causal LM training.

    Args:
        txt_paths: pair `(train_path, eval_path)` of plain-text files.
        tokenizer: tokenizer forwarded to `fine_tuning_util.preprocess_concatenate`.
        num_procs: number of processes forwarded to the preprocessing helper.

    Returns:
        The result of `fine_tuning_util.preprocess_concatenate` applied (with
        `mlm=False`) to the dataset loaded with the "train" and "eval" splits.
    """
    train_path, eval_path = txt_paths
    data_files = {"train": train_path, "eval": eval_path}
    raw_splits = load_dataset("text", data_files=data_files)
    return fine_tuning_util.preprocess_concatenate(tokenizer, raw_splits, num_procs, mlm=False)

# Tokenizer

In [None]:
def setup_tokenizer(tokenizer, file_config, model):
    """Register the special tokens declared in `file_config` with the tokenizer and model.

    Each of the beginning-of-verse, end-of-verse and end-of-poem tokens is
    registered through `add_special_token` only when it is configured as a
    non-empty string. The `copy_from` argument passed along is `''` for the
    beginning of verse, `'\\n'` for the end of verse and `tokenizer.eos_token`
    for the end of poem.

    Returns:
        Tuple `(bov_token_id, eov_token_id, eop_token_id)` holding the values
        returned by `add_special_token`, or `None` for tokens configured as `''`.
    """
    def _register(token, copy_from):
        return add_special_token(token, tokenizer, model, copy_from=copy_from)

    bov_token = file_config.beginning_of_verse_token
    bov_token_id = _register(bov_token, '') if bov_token != '' else None

    eov_token = file_config.end_of_verse_token
    eov_token_id = _register(eov_token, '\n') if eov_token != '' else None

    eop_token = file_config.end_of_poem_token
    # `tokenizer.eos_token` is only read when the end-of-poem token is configured
    eop_token_id = _register(eop_token, tokenizer.eos_token) if eop_token != '' else None

    return bov_token_id, eov_token_id, eop_token_id

# Training

In [None]:
!mkdir checkpoints

In [None]:
def build_trainer(model, tokenizer, datasets, n_epochs, bs=1, output_path='./checkpoints', optimizers=(None, None),
                  compute_metrics=None, preprocess_logits_for_metrics=None, extra_loss_fns=None, callbacks=None,
                  **train_kwargs):
    """Create a `PoemsTrainer` that, by default, evaluates and saves once per epoch.

    Args:
        model: model to train.
        tokenizer: tokenizer handed to the trainer.
        datasets: mapping with a 'train' and an 'eval' split.
        n_epochs: number of training epochs.
        bs: per-device batch size, used for both training and evaluation.
        output_path: directory where checkpoints are written.
        optimizers: `(optimizer, scheduler)` pair forwarded to the trainer.
        compute_metrics: metrics callback forwarded to the trainer.
        preprocess_logits_for_metrics: logits preprocessing callback forwarded to the trainer.
        extra_loss_fns: additional loss functions forwarded to `PoemsTrainer`.
        callbacks: trainer callbacks.
        **train_kwargs: any other `TrainingArguments` option. They take precedence
            over the defaults set here (`evaluation_strategy`, `save_strategy`,
            `report_to`), so those can be overridden without a duplicate-keyword error.

    Returns:
        The configured `PoemsTrainer`.
    """
    epoch_strategy = transformers.trainer_utils.IntervalStrategy.EPOCH
    args_kwargs = {
        'evaluation_strategy': epoch_strategy,
        'save_strategy': epoch_strategy,  # the library default is "steps"
        'report_to': ["none"],
    }
    # Caller-supplied options win over the defaults above
    args_kwargs.update(train_kwargs)
    training_args = TrainingArguments(
        output_dir=output_path,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        num_train_epochs=n_epochs,
        **args_kwargs
    )
    trainer = PoemsTrainer(
        model=model,
        args=training_args,
        train_dataset=datasets['train'],
        eval_dataset=datasets['eval'],
        tokenizer=tokenizer,
        data_collator=default_data_collator,
        optimizers=optimizers,
        compute_metrics=compute_metrics,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
        extra_loss_fns=extra_loss_fns,
        callbacks=callbacks,
    )
    return trainer

In [None]:
orig_model_name = "gpt2"

# Tr 1: train with label inserted as an additional verse at the beginning of the poem

Some poems have only topic as label and the rest have only form as label, so we train a different model for each subset of poems.

In [None]:
resume_training = False

In [None]:
labels_type = LabelsType.Topics
labels_type_str = labels_type.value[:-1]

In [None]:
train_text_path = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.labeled.by_{labels_type_str}.train.{lang_str}.txt"
valid_text_path = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.labeled.by_{labels_type_str}.valid.{lang_str}.txt"
custom_model_name = f"gpt2-poems-cond-{labels_type_str}.{lang_str}"
model_url = model_to_url(custom_model_name, HF_USER)

In [None]:
# Clone repo of our model to commit there later
if resume_training:
    hf_pwd = 'YOUR_HUGGING_FACE_PASSWORD'
    download_checkpoint_from_hf_hub(custom_model_name, HF_USER, hf_pwd)
    hf_pwd = ''

In [None]:
happy_gen = HappyGeneration("GPT2", orig_model_name)
# model = AutoModelForCausalLM.from_pretrained(orig_model_name, resid_pdrop=0.3)
# tokenizer = AutoTokenizer.from_pretrained(orig_model_name)
model = happy_gen.model
tokenizer = happy_gen.tokenizer

In [None]:
file_config = PoemsFileConfig.from_json(Path(train_text_path).parent/'all_poems.en.conf.json')
begin_verse_id, end_verse_id, end_poem_id = setup_tokenizer(tokenizer, 
                                                            file_config, 
                                                            model)
(tokenizer.additional_special_tokens, 
 tokenizer.additional_special_tokens_ids, 
 tokenizer.all_special_tokens, 
 begin_verse_id, end_verse_id, end_poem_id)

In [None]:
datasets = get_datasets((train_text_path, valid_text_path), tokenizer, num_procs=1)
datasets

In [None]:
# Set to True to smoke-test the pipeline on three samples per split
debug_mode = False
if debug_mode:
    # `DatasetDict` comes from the import cell at the top of the notebook
    datasets = DatasetDict({k: v.select([1, 2, 3]) for k, v in datasets.items()})
datasets

Let's see an example of the decoding of an already tokenized sample. If everything is ok, '\\n' should appear at the end of each verse and '<|endoftext|>' at the end of each poem.

In [None]:
tokenizer.decode(datasets['train'][0]['input_ids'])

In [None]:
bs = 1
n_steps_by_epoch = len(datasets['train']) // bs
# The label is inserted as the first verse of each poem, so that verse is excluded
# from the metadata-less metric (same setting as the single-label-verse setup of Tr 2)
compute_metrics = get_compute_metrics_metadataless(begin_verse_id=begin_verse_id, 
                                                   end_verse_id=end_verse_id, 
                                                   end_poem_id=end_poem_id,
                                                   n_initial_verses_to_ignore=1)
trainer = build_trainer(model, tokenizer, datasets, 50, bs=bs, learning_rate=5e-5,
                        compute_metrics=compute_metrics,
                        preprocess_logits_for_metrics=preprocess_logits_for_metadataless_loss,
)

In [None]:
(trainer.args.num_train_epochs, trainer.args.max_steps, trainer.args.evaluation_strategy, trainer.args.eval_steps, trainer.args.save_strategy, 
trainer.args.learning_rate, trainer.args.lr_scheduler_type)

In [None]:
trainer.train(resume_from_checkpoint=custom_model_name if resume_training else None)

In [None]:
!ls -l checkpoints

In [None]:
custom_model_name, get_last_checkpoint('./checkpoints')

In [None]:
commit_checkpoint_to_hf_hub(custom_model_name, HF_USER, get_last_checkpoint('./checkpoints'),
                            message='Add model, best at 4 epochs', pwd='')

# Tr 2: train with the available label inserted as an additional verse at the beginning of the poem

Some poems have only topic as label and the rest have only form as label, but here we train the same model for both.

In [None]:
resume_training = False

train_text_path1 = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.labeled.by_topic.train.{lang_str}.txt"
train_text_path2 = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.labeled.by_form.train.{lang_str}.txt"
valid_text_path1 = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.labeled.by_topic.valid.{lang_str}.txt"
valid_text_path2 = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.labeled.by_form.valid.{lang_str}.txt"

In [None]:
custom_model_name = f"gpt2-poems-cond-topic-or-form.{lang_str}"
model_url = model_to_url(custom_model_name, HF_USER)

In [None]:
# Clone repo of our model to commit there later
if resume_training:
    hf_pwd = 'YOUR_HUGGING_FACE_PASSWORD'
    download_checkpoint_from_hf_hub(custom_model_name, HF_USER, hf_pwd)
    hf_pwd = ''

In [None]:
happy_gen = HappyGeneration("GPT2", orig_model_name)
model = happy_gen.model
tokenizer = happy_gen.tokenizer

In [None]:
file_config = PoemsFileConfig.from_json(Path(train_text_path1).parent/'all_poems.en.conf.json')
begin_verse_id, end_verse_id, end_poem_id = setup_tokenizer(tokenizer, 
                                                            file_config, 
                                                            model)
(tokenizer.additional_special_tokens, 
 tokenizer.additional_special_tokens_ids, 
 tokenizer.all_special_tokens, 
 begin_verse_id, end_verse_id, end_poem_id)

In [None]:
datasets_topic_labeled = get_datasets((train_text_path1, valid_text_path1), tokenizer, num_procs=1)
datasets_form_labeled = get_datasets((train_text_path2, valid_text_path2), tokenizer, num_procs=1)
datasets = DatasetDict({
    split: concatenate_datasets((datasets_topic_labeled[split], datasets_form_labeled[split]))
    for split in datasets_topic_labeled.keys()
})
datasets_topic_labeled, datasets_form_labeled, datasets

In [None]:
bs = 1
n_steps_by_epoch = len(datasets['train']) // bs
compute_metrics = get_compute_metrics_metadataless(begin_verse_id=begin_verse_id, 
                                                   end_verse_id=end_verse_id, 
                                                   end_poem_id=end_poem_id,
                                                   n_initial_verses_to_ignore=1)
trainer = build_trainer(model, tokenizer, datasets, 50, bs=bs, learning_rate=5e-5,
                        compute_metrics=compute_metrics,
                        preprocess_logits_for_metrics=preprocess_logits_for_metadataless_loss,
)

In [None]:
trainer.train(resume_from_checkpoint=custom_model_name if resume_training else None)

In [None]:
!ls -l checkpoints

In [None]:
custom_model_name, get_last_checkpoint('./checkpoints')

In [None]:
commit_checkpoint_to_hf_hub(custom_model_name, HF_USER, get_last_checkpoint('./checkpoints'),
                            message='Add model, 6 epochs', pwd='')

# Tr 3: train with labels inserted as additional verses at the beginning of the poem, '?' verse for labels not available


In [None]:
resume_training = False

In [None]:
train_text_path = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.labeled.by_form_and_topic.train.{lang_str}.txt"
valid_text_path = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.labeled.by_form_and_topic.valid.{lang_str}.txt"

In [None]:
custom_model_name = f"gpt2-poems-cond-form-and-topic.{lang_str}"
model_url = model_to_url(custom_model_name, HF_USER)

In [None]:
# Clone repo of our model to commit there later
if resume_training:
    hf_pwd = 'YOUR_HUGGING_FACE_PASSWORD'
    download_checkpoint_from_hf_hub(custom_model_name, HF_USER, hf_pwd)
    hf_pwd = ''

In [None]:
happy_gen = HappyGeneration("GPT2", orig_model_name)
model = happy_gen.model
tokenizer = happy_gen.tokenizer

In [None]:
file_config = PoemsFileConfig.from_json(Path(train_text_path).parent/'all_poems.en.conf.json')
begin_verse_id, end_verse_id, end_poem_id = setup_tokenizer(tokenizer, 
                                                            file_config, 
                                                            model)
(tokenizer.additional_special_tokens, 
 tokenizer.additional_special_tokens_ids, 
 tokenizer.all_special_tokens, 
 begin_verse_id, end_verse_id, end_poem_id)

In [None]:
datasets = get_datasets((train_text_path, valid_text_path), tokenizer, num_procs=1)
datasets

Let's examine a sample:

In [None]:
tokenizer.decode(datasets['train'][0]['input_ids'])

In [None]:
bs = 1
n_steps_by_epoch = len(datasets['train']) // bs
compute_metrics = get_compute_metrics_metadataless(begin_verse_id=begin_verse_id, 
                                                   end_verse_id=end_verse_id, 
                                                   end_poem_id=end_poem_id,
                                                   n_initial_verses_to_ignore=2)
trainer = build_trainer(model, tokenizer, datasets, 50, bs=bs, learning_rate=5e-5,
                        compute_metrics=compute_metrics,
                        preprocess_logits_for_metrics=preprocess_logits_for_metadataless_loss,
)

In [None]:
trainer.train(resume_from_checkpoint=custom_model_name if resume_training else None)

In [None]:
custom_model_name, get_last_checkpoint('./checkpoints')

In [None]:
commit_checkpoint_to_hf_hub(custom_model_name, HF_USER, get_last_checkpoint('./checkpoints'),
                            message='Add model, 3 epochs', pwd='')

# Tr 4: train with labels inserted as an additional verse at the beginning of the poem, with key: value format

For instance:

form: sonnet, topic: ?\n<br>
Verse 1\n<br>
...

In [None]:
resume_training = False

In [None]:
train_text_path = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.labeled.by_form_and_topic_kv.train.{lang_str}.txt"
valid_text_path = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.labeled.by_form_and_topic_kv.valid.{lang_str}.txt"

In [None]:
custom_model_name = f"gpt2-poems-cond-form-and-topic-kv.{lang_str}"
model_url = model_to_url(custom_model_name, HF_USER)

In [None]:
# Clone repo of our model to commit there later
if resume_training:
    hf_pwd = 'YOUR_HUGGING_FACE_PASSWORD'
    download_checkpoint_from_hf_hub(custom_model_name, HF_USER, hf_pwd)
    hf_pwd = ''

In [None]:
# The fine-tuned checkpoint named `custom_model_name` is only downloaded when
# `resume_training` is True; a fresh run must start from the pretrained base
# model, like the other experiments in this notebook
happy_gen = HappyGeneration("GPT2", custom_model_name if resume_training else orig_model_name)
model = happy_gen.model
tokenizer = happy_gen.tokenizer

In [None]:
file_config = PoemsFileConfig.from_json(Path(train_text_path).parent/'all_poems.en.conf.json')
begin_verse_id, end_verse_id, end_poem_id = setup_tokenizer(tokenizer, 
                                                            file_config, 
                                                            model)
tokenizer.all_special_tokens, begin_verse_id, end_verse_id, end_poem_id

In [None]:
datasets = get_datasets((train_text_path, valid_text_path), tokenizer, num_procs=1)
datasets

Let's examine a sample:

In [None]:
tokenizer.decode(datasets['train'][0]['input_ids'])

In [None]:
bs = 1
compute_metrics = get_compute_metrics_metadataless(begin_verse_id=begin_verse_id, 
                                                   end_verse_id=end_verse_id, 
                                                   end_poem_id=end_poem_id,
                                                   n_initial_verses_to_ignore=1)
trainer = build_trainer(model, tokenizer, datasets, 50, bs=bs, learning_rate=5e-5,
                        compute_metrics=compute_metrics,
                        preprocess_logits_for_metrics=preprocess_logits_for_metadataless_loss,
)

In [None]:
trainer.train(resume_from_checkpoint=custom_model_name if resume_training else None)

In [None]:
!ls -l checkpoints
custom_model_name, get_last_checkpoint('./checkpoints')

In [None]:
commit_checkpoint_to_hf_hub(custom_model_name, HF_USER, get_last_checkpoint('./checkpoints'),
                            message='Update model, 3 epochs (best valid score)', pwd='')

# Tr 5: train with a verse inserted for each label at the beginning of the poem, with key: value format

For instance:

form: sonnet \n<br>
topic: ? \n <br>
Verse 1 \n <br>
...

In [None]:
resume_training = False

In [None]:
train_text_path = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.labeled.by_form_and_topic_kv_mv.train.{lang_str}.txt"
valid_text_path = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.labeled.by_form_and_topic_kv_mv.valid.{lang_str}.txt"

In [None]:
custom_model_name = f"gpt2-poems-cond-form-and-topic-kv-mv.{lang_str}"
model_url = model_to_url(custom_model_name, HF_USER)

In [None]:
# Clone repo of our model to commit there later
if resume_training:
    hf_pwd = 'YOUR_HUGGING_FACE_PASSWORD'
    download_checkpoint_from_hf_hub(custom_model_name, HF_USER, hf_pwd)
    hf_pwd = ''

In [None]:
happy_gen = HappyGeneration("GPT2", orig_model_name)
model = happy_gen.model
tokenizer = happy_gen.tokenizer

In [None]:
file_config = PoemsFileConfig.from_json(Path(train_text_path).parent/'all_poems.en.conf.json')
begin_verse_id, end_verse_id, end_poem_id = setup_tokenizer(tokenizer, 
                                                            file_config, 
                                                            model)
tokenizer.all_special_tokens, begin_verse_id, end_verse_id, end_poem_id

In [None]:
datasets = get_datasets((train_text_path, valid_text_path), tokenizer, num_procs=1)
datasets

Let's examine a sample:

In [None]:
tokenizer.decode(datasets['train'][0]['input_ids'])

In [None]:
bs = 1
compute_metrics = get_compute_metrics_metadataless(begin_verse_id=begin_verse_id, 
                                                   end_verse_id=end_verse_id, 
                                                   end_poem_id=end_poem_id,
                                                   n_initial_verses_to_ignore=2)
trainer = build_trainer(model, tokenizer, datasets, 50, bs=bs, learning_rate=5e-5,
                        compute_metrics=compute_metrics,
                        preprocess_logits_for_metrics=preprocess_logits_for_metadataless_loss,
)

In [None]:
trainer.train(resume_from_checkpoint=custom_model_name if resume_training else None)

In [None]:
!ls -l checkpoints
custom_model_name, get_last_checkpoint('./checkpoints')

In [None]:
commit_checkpoint_to_hf_hub(custom_model_name, HF_USER, get_last_checkpoint('./checkpoints'),
                            message='Add model, 4 epochs', pwd='')

# Tr 6: train with a description of the labels at the beginning of the poem, '?' for labels not available

For instance (assume form is not available):

This is a poem with ? form about love: \n<br>
Verse 1 \n <br>
...

In [None]:
resume_training = False

In [None]:
train_text_path = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.labeled.by_form_and_topic_exp.train.{lang_str}.txt"
valid_text_path = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.labeled.by_form_and_topic_exp.valid.{lang_str}.txt"

In [None]:
custom_model_name = f"gpt2-poems-cond-form-and-topic-exp.{lang_str}"
model_url = model_to_url(custom_model_name, HF_USER)

In [None]:
# Clone repo of our model to commit there later
if resume_training:
    hf_pwd = 'YOUR_HUGGING_FACE_PASSWORD'
    download_checkpoint_from_hf_hub(custom_model_name, HF_USER, hf_pwd)
    hf_pwd = ''

In [None]:
happy_gen = HappyGeneration("GPT2", orig_model_name)
model = happy_gen.model
tokenizer = happy_gen.tokenizer

In [None]:
file_config = PoemsFileConfig.from_json(Path(train_text_path).parent/'all_poems.en.conf.json')
begin_verse_id, end_verse_id, end_poem_id = setup_tokenizer(tokenizer, 
                                                            file_config, 
                                                            model)
tokenizer.all_special_tokens, begin_verse_id, end_verse_id, end_poem_id

In [None]:
datasets = get_datasets((train_text_path, valid_text_path), tokenizer, num_procs=1)
datasets

Let's examine a sample:

In [None]:
tokenizer.decode(datasets['train'][0]['input_ids'])

In [None]:
bs = 1
compute_metrics = get_compute_metrics_metadataless(begin_verse_id=begin_verse_id, 
                                                   end_verse_id=end_verse_id, 
                                                   end_poem_id=end_poem_id,
                                                   n_initial_verses_to_ignore=1)
trainer = build_trainer(model, tokenizer, datasets, 50, bs=bs, learning_rate=5e-5,
                        compute_metrics=compute_metrics,
                        preprocess_logits_for_metrics=preprocess_logits_for_metadataless_loss,
)

In [None]:
trainer.train(resume_from_checkpoint=custom_model_name if resume_training else None)

In [None]:
!ls -l checkpoints
custom_model_name, get_last_checkpoint('./checkpoints')

In [None]:
commit_checkpoint_to_hf_hub(custom_model_name, HF_USER, get_last_checkpoint('./checkpoints'),
                            message='Add model, 3 epochs', pwd='')

# Tr 7: train with a description of the labels at the beginning of the poem, no description for categories not available

For instance (assume form is not available):

This is a poem about love: \n<br>
Verse 1 \n <br>
...

In [None]:
resume_training = False

In [None]:
train_text_path = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.labeled.by_form_and_topic_exp_s.train.{lang_str}.txt"
valid_text_path = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.labeled.by_form_and_topic_exp_s.valid.{lang_str}.txt"

In [None]:
custom_model_name = f"gpt2-poems-cond-form-or-topic-exp-s.{lang_str}"
model_url = model_to_url(custom_model_name, HF_USER)

In [None]:
# Clone repo of our model to commit there later
if resume_training:
    hf_pwd = 'YOUR_HUGGING_FACE_PASSWORD'
    download_checkpoint_from_hf_hub(custom_model_name, HF_USER, hf_pwd)
    hf_pwd = ''

In [None]:
happy_gen = HappyGeneration("GPT2", orig_model_name)
model = happy_gen.model
tokenizer = happy_gen.tokenizer

In [None]:
file_config = PoemsFileConfig.from_json(Path(train_text_path).parent/'all_poems.en.conf.json')
begin_verse_id, end_verse_id, end_poem_id = setup_tokenizer(tokenizer, 
                                                            file_config, 
                                                            model)
tokenizer.all_special_tokens, begin_verse_id, end_verse_id, end_poem_id

In [None]:
datasets = get_datasets((train_text_path, valid_text_path), tokenizer, num_procs=1)
datasets

Let's examine a sample:

In [None]:
tokenizer.decode(datasets['train'][1000]['input_ids'])

In [None]:
bs = 1
compute_metrics = get_compute_metrics_metadataless(begin_verse_id=begin_verse_id, 
                                                   end_verse_id=end_verse_id, 
                                                   end_poem_id=end_poem_id,
                                                   n_initial_verses_to_ignore=1)
trainer = build_trainer(model, tokenizer, datasets, 50, bs=bs, learning_rate=5e-5,
                        compute_metrics=compute_metrics,
                        preprocess_logits_for_metrics=preprocess_logits_for_metadataless_loss,
)

In [None]:
trainer.train(resume_from_checkpoint=custom_model_name if resume_training else None)

In [None]:
!ls -l checkpoints
custom_model_name, get_last_checkpoint('./checkpoints')

In [None]:
# Unlike the other experiments, a fixed checkpoint directory is pushed here instead
# of the last one found in './checkpoints'.
# NOTE(review): presumably checkpoint-13545 is the one saved after epoch 3 (see the
# commit message) — confirm the directory exists before running this cell.
commit_checkpoint_to_hf_hub(custom_model_name, HF_USER, './checkpoints/checkpoint-13545', #get_last_checkpoint('./checkpoints'),
                            message='Add model, 3 epochs', pwd='')

# Tr 8: train with a description of the labels at the beginning of the poem, missing labels filled by classifiers

For instance (assume form is not available):

This is a poem with sonnet form about love: \n<br>
Verse 1 \n <br>
...

The missing labels are filled with the labels predicted by classifiers of each category.

In [None]:
resume_training = False

In [None]:
train_text_path = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.labeled.by_form_and_topic_exp_filled.train.{lang_str}.txt"
valid_text_path = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.labeled.by_form_and_topic_exp_filled.valid.{lang_str}.txt"

In [None]:
custom_model_name = f"gpt2-poems-cond-form-and-topic-exp-filled.{lang_str}"
model_url = model_to_url(custom_model_name, HF_USER)

In [None]:
# Clone repo of our model to commit there later
if resume_training:
    hf_pwd = 'YOUR_HUGGING_FACE_PASSWORD'
    download_checkpoint_from_hf_hub(custom_model_name, HF_USER, hf_pwd)
    hf_pwd = ''

In [None]:
happy_gen = HappyGeneration("GPT2", orig_model_name)
model = happy_gen.model
tokenizer = happy_gen.tokenizer

In [None]:
file_config = PoemsFileConfig.from_json(Path(train_text_path).parent/'all_poems.en.conf.json')
begin_verse_id, end_verse_id, end_poem_id = setup_tokenizer(tokenizer, 
                                                            file_config, 
                                                            model)
tokenizer.all_special_tokens, begin_verse_id, end_verse_id, end_poem_id

In [None]:
datasets = get_datasets((train_text_path, valid_text_path), tokenizer, num_procs=1)
datasets

Let's examine a sample:

In [None]:
tokenizer.decode(datasets['train'][1000]['input_ids'])

In [None]:
bs = 1
compute_metrics = get_compute_metrics_metadataless(begin_verse_id=begin_verse_id, 
                                                   end_verse_id=end_verse_id, 
                                                   end_poem_id=end_poem_id,
                                                   n_initial_verses_to_ignore=1)
trainer = build_trainer(model, tokenizer, datasets, 50, bs=bs, learning_rate=5e-5,
                        compute_metrics=compute_metrics,
                        preprocess_logits_for_metrics=preprocess_logits_for_metadataless_loss,
)

In [None]:
trainer.train(resume_from_checkpoint=custom_model_name if resume_training else None)

In [None]:
!ls -l checkpoints
custom_model_name, get_last_checkpoint('./checkpoints')

In [None]:
commit_checkpoint_to_hf_hub(custom_model_name, HF_USER, get_last_checkpoint('./checkpoints'),
                            message='Add model, 4 epochs', pwd='')

# Tr 9: train with a description of the labels at the beginning of the poem, '?' for labels not available, conditional loss

In [None]:
resume_training = False

In [None]:
train_text_path = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.labeled.by_form_and_topic_exp.train.{lang_str}.txt"
valid_text_path = f"dataset/all.txt/{lang_str}.txt/only_end_tags/all_poems.labeled.by_form_and_topic_exp.valid.{lang_str}.txt"

In [None]:
custom_model_name = f"gpt2-poems-cond-form-and-topic-exp-condloss.{lang_str}"
model_url = model_to_url(custom_model_name, HF_USER)

In [None]:
# Clone repo of our model to commit there later
if resume_training:
    hf_pwd = 'YOUR_HUGGING_FACE_PASSWORD'
    download_checkpoint_from_hf_hub(custom_model_name, HF_USER, hf_pwd)
    hf_pwd = ''

In [None]:
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(orig_model_name, use_cache=False)
tokenizer = AutoTokenizer.from_pretrained(orig_model_name)

In [None]:
file_config = PoemsFileConfig.from_json(Path(train_text_path).parent/'all_poems.en.conf.json')
begin_verse_id, end_verse_id, end_poem_id = setup_tokenizer(tokenizer, 
                                                            file_config, 
                                                            model)
tokenizer.all_special_tokens, begin_verse_id, end_verse_id, end_poem_id

In [None]:
datasets = get_datasets((train_text_path, valid_text_path), tokenizer, num_procs=1)
datasets

Let's examine an input example:

In [None]:
tokenizer.decode(datasets['train'][1000]['input_ids'])

Load the classifiers needed for conditional loss:

In [None]:
# Names of the model repositories (under `HF_USER`) that hold the two classifiers
clf_by_topic_checkpoint = "gpt2-poems-clf-by-topic.en"
clf_by_form_checkpoint  = "gpt2-poems-clf-by-form.en"
# Placeholder: replace with your password before running; it is blanked right after use
hf_pwd = 'YOUR_HUGGING_FACE_PASSWORD'
download_checkpoint_from_hf_hub(clf_by_topic_checkpoint, HF_USER, hf_pwd)
download_checkpoint_from_hf_hub(clf_by_form_checkpoint, HF_USER, hf_pwd)
hf_pwd = ''
# NOTE(review): presumably each download leaves the checkpoint in a local directory
# named after the repository, which is what `from_pretrained` receives below — confirm
clf_by_topic = AutoModelForSequenceClassification.from_pretrained(clf_by_topic_checkpoint, use_cache=False)
tokenizer_clf_by_topic = AutoTokenizer.from_pretrained(clf_by_topic_checkpoint)
clf_by_form = AutoModelForSequenceClassification.from_pretrained(clf_by_form_checkpoint, use_cache=False)
tokenizer_clf_by_form = AutoTokenizer.from_pretrained(clf_by_form_checkpoint)

In [None]:
bs = 1
compute_metrics = get_compute_metrics_metadataless(begin_verse_id=begin_verse_id, 
                                                   end_verse_id=end_verse_id, 
                                                   end_poem_id=end_poem_id,
                                                   n_initial_verses_to_ignore=1)

# Use the GPU when one is visible; otherwise fall back to the CPU instead of
# failing with a hard-coded 'cuda' device
cond_loss_device = 'cuda' if torch.cuda.is_available() else 'cpu'

labels_decoder = LabelsDecoderExplained()
cond_loss_by_topic = ConditionalGenLoss(
    clf_by_topic, tokenizer_clf_by_topic, tokenizer, LabelsType.Topics, labels_decoder, file_config, 
    end_poem_id, gen_bov_token_id=begin_verse_id, gen_eov_token_id=end_verse_id, device=cond_loss_device,
    # This is needed because `GPT2ForSequenceClassification` doesn't know how to deal with padded
    # inputs when `inputs_embeds` are passed instead of `input_ids`.
    # It also helps with OOM issues.
    max_clf_bs=1,
    #max_clf_input_tokens=600
)
def cond_loss_by_topic_fn(output, labels):
    """Topic conditional loss on shifted inputs: the last position of `output` and the
    first position of `labels` are dropped before calling `cond_loss_by_topic`."""
    return cond_loss_by_topic(output[:, :-1], labels[:, 1:])


cond_loss_by_form = ConditionalGenLoss(
    clf_by_form, tokenizer_clf_by_form, tokenizer, LabelsType.Forms, labels_decoder, file_config, 
    end_poem_id, gen_bov_token_id=begin_verse_id, gen_eov_token_id=end_verse_id, device=cond_loss_device,
    # This is needed because `GPT2ForSequenceClassification` doesn't know how to deal with padded
    # inputs when `inputs_embeds` are passed instead of `input_ids`.
    # It also helps with OOM issues.
    max_clf_bs=1,
    #max_clf_input_tokens=600,
)
def cond_loss_by_form_fn(output, labels):
    """Form conditional loss on shifted inputs: the last position of `output` and the
    first position of `labels` are dropped before calling `cond_loss_by_form`."""
    return cond_loss_by_form(output[:, :-1], labels[:, 1:])


# Each entry is a `(loss function, 1.)` pair keyed by a display name.
# NOTE(review): the second element is presumably the weight `PoemsTrainer` gives
# to that loss — confirm against `poemsai.trainer`.
extra_loss_fns = {
    'topic cond loss': (cond_loss_by_topic_fn, 1.),
    'form cond loss': (cond_loss_by_form_fn, 1.),
}


trainer = build_trainer(model, tokenizer, datasets, 50, bs=bs, learning_rate=5e-5,
                        compute_metrics=compute_metrics,
                        preprocess_logits_for_metrics=preprocess_logits_for_metadataless_loss,
                        extra_loss_fns=extra_loss_fns,
                        #fp16=True, 
                        gradient_checkpointing=True,
                        #optim='adafactor'
                        adafactor=True, # deprecated soon, change to above when HF version > 4.15
)

In [None]:
# Temporarily disable GPT2ForSequenceClassification warning for its inability
# to handle padding
transformers.logging.set_verbosity_error()

In [None]:
# with torch.profiler.profile(
#         schedule=torch.profiler.schedule(wait=0, warmup=0, active=4, repeat=10),
#         #schedule=torch.profiler.schedule(wait=0, warmup=1, active=3, repeat=10),
#         on_trace_ready=torch.profiler.tensorboard_trace_handler('./log'),
#         record_shapes=True,
#         profile_memory=True,
#         with_stack=True
# ) as prof:
# To save profiling info, execute `prof.step()` inside some callback
# The Tensorboard Pytorch Profiler is able to load the generated logs
trainer.train(resume_from_checkpoint=custom_model_name if resume_training else None)

In [None]:
!ls -l checkpoints
custom_model_name, get_last_checkpoint('./checkpoints')

In [None]:
commit_checkpoint_to_hf_hub(custom_model_name, HF_USER, get_last_checkpoint('./checkpoints'),
                            message='Add model, 4 epochs', pwd='')