#  Fine-tuning GPT-2 on Custom Text

In [2]:
import torch
from transformers import (
    GPT2TokenizerFast,
    GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
input_file = r"D:\finetune_gpt2.py\my_text_enhanced.txt"
dataset = load_dataset("text", data_files={"train": input_file})

Generating train split: 26 examples [00:00, 559.20 examples/s]


In [5]:
model_name = "gpt2"
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [6]:
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
lm_dataset = tokenized_datasets["train"]

Map: 100%|██████████| 26/26 [00:00<00:00, 405.87 examples/s]


In [7]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [9]:
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,
)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset,
    data_collator=data_collator,
)

trainer.train()
trainer.save_model("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")



Step,Training Loss


('./gpt2-finetuned\\tokenizer_config.json',
 './gpt2-finetuned\\special_tokens_map.json',
 './gpt2-finetuned\\vocab.json',
 './gpt2-finetuned\\merges.txt',
 './gpt2-finetuned\\added_tokens.json',
 './gpt2-finetuned\\tokenizer.json')

In [12]:
tokenizer = GPT2TokenizerFast.from_pretrained("./gpt2-finetuned")
model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50258, bias=False)
)

In [14]:
def generate_text(prompt, max_new_tokens=100, temperature=1.0, top_k=50, top_p=0.95):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_k=top_k,
            top_p=top_p,
            temperature=temperature,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [16]:
print("SAMPLE 1")
print(generate_text("Once upon a time"))

print("\n SAMPLE 2")
print(generate_text("The future of AI is"))

SAMPLE 1
Once upon a time, deep in a faraway land, there lived a young explorer who dreamed of discovering new worlds. At last, he stumbled upon the first continent on which to explore—a continent that would change everything.

Around the corner, in a faraway land, there lived a young explorer who dreamed of discovering new worlds. At last, he stumbled upon the first continent on which to explore—a continent that would change everything.

"I see it in the distance, in the shadows,

 SAMPLE 2
The future of AI is locked down in the Machine Room, and the only way for humanity to unlock the key to unlocking it was if it would awaken its mysteries. Now, there's one simple question left unanswered…

"Mystery solved!" ―Mae Windrunner [src]

The Alpha Quadrant was an ancient starship populated by young scientists who dreamed of exploring new worlds, pursuing new worlds, and discovering new worlds. Known as the cradle of civilization, it served as the engine of starship colonization until a
