## Load Model and Dataset

In [1]:
!pip install --upgrade huggingface_hub



In [2]:
!pip install transformers
!pip install accelerate>=0.20.1
!pip install transformers[torch]



In [13]:
!pip install evaluate
!pip install torchmetrics
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=3500fead1d65f5b26f39bfaa7ce20f6b95f140dda5284a057c67c19cdc61038b
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [4]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, EvalPrediction
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
train_file_path = "/content/drive/MyDrive/lovepoem/lovepoem200.txt"
valid_file_path = "/content/drive/MyDrive/lovepoem/lovepoem25.txt"
output_dir = '/content/drive/MyDrive/lovepoem/results/test_allmetric/'

In [7]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [8]:
train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_file_path,
        block_size=128)

val_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=valid_file_path,
        block_size=128)

data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )



## Train

In [14]:
import evaluate
import numpy as np
from scipy.special import softmax
from sklearn.metrics import log_loss
# import torchmetrics
# from torchmetrics.text import Perplexity


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    metric_bleu = evaluate.load("bleu")
    metric_rouge = evaluate.load("rouge")

    preds, labels = eval_preds
    # perp = Perplexity(preds, labels)
    probs = softmax(preds, axis=-1)
    loss = log_loss(labels.flatten(), probs.reshape(-1, probs.shape[-1]), labels=[i for i in range(preds.shape[-1])])
    perplexity = np.exp(loss)
    preds = preds.argmax(axis=-1)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result_bleu = metric_bleu.compute(predictions=decoded_preds, references=decoded_labels)
    result_rouge = metric_rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result_rouge["bleu"] = result_bleu["bleu"]
    result_rouge["perplexity"] = perplexity

    return result_rouge

In [15]:
training_args = TrainingArguments(
          output_dir=output_dir,
          overwrite_output_dir=False,
          per_device_train_batch_size=8,
          num_train_epochs=5.0,
          # save_steps=500,
          logging_steps = 25,
          load_best_model_at_end=True,
          evaluation_strategy='epoch',
          save_strategy='epoch',
          metric_for_best_model='bleu',
          # greater_is_better=False,
          save_safetensors=False,
          push_to_hub=False
          )

In [16]:
trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset,
          eval_dataset=val_dataset,
          compute_metrics=compute_metrics
          )

In [17]:
trainer.train()
trainer.save_model()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Perplexity
1,4.0441,4.527138,0.392959,0.061325,0.236597,0.384297,0.023396,6484.730295
2,3.9213,4.546423,0.390172,0.055296,0.231218,0.380037,0.019831,8781.293087
3,3.741,4.570492,0.388749,0.05237,0.22989,0.377671,0.017569,9202.219496
4,3.672,4.586753,0.383492,0.053595,0.229418,0.372717,0.01543,10580.769215
5,3.585,4.599734,0.384562,0.052332,0.228959,0.373987,0.015836,11345.473559


## Generation

In [20]:
def generate_text(sequence, max_length):
    model = GPT2LMHeadModel.from_pretrained(output_dir)
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    return tokenizer.decode(final_outputs[0], skip_special_tokens=True)

In [23]:
sequence = "Love"
max_len = 200
poem = generate_text(sequence, max_len)
print(poem)

Love
I loved you
in vain
until I found you
in my arms
in heaven
in dreams I was a happy man 
