The following is the code to fine-tune a GPT-2 model for paraphrase generation.

In [None]:
# install the relevant packages
!pip install transformers pandas datasets pynvml huggingface_hub sentence-transformers rouge_score

In [None]:
import gc

import torch
from huggingface_hub import notebook_login
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoModelWithLMHead,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    TextDataset,
    Trainer,
    TrainingArguments,
    pipeline,
)
# Authenticate with the Hugging Face Hub; needed because the training
# arguments below set push_to_hub=True.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


In [None]:
# Central configuration for the notebook.
# MODEL_NAME is the single source of truth for the base checkpoint; the output
# directory is derived from it so it stays in sync with the Hub repo name and
# the final save path, which are both built as "<model_name>-paraphraser".
MODEL_NAME = "gpt2-large"  # gpt2, gpt2-medium, gpt2-large, gpt2-xl
config = {
    "model_name": MODEL_NAME,
    # plain-text training file read by TextDataset
    "train_dataset_filename": "combined.txt",
    # despite the key name, this value is passed to TextDataset as `block_size`
    "data_collator_block_size": 256,
    # where the Trainer writes checkpoints
    "output_dir": f"{MODEL_NAME}-paraphraser",
    # used as per_device_train_batch_size
    "batch_size": 8,
    # used as num_train_epochs
    "epochs": 1,
}

In [None]:
# Load the pretrained model and its matching tokenizer from the configured model name.
# AutoModelForCausalLM is used instead of the deprecated AutoModelWithLMHead;
# for GPT-2 checkpoints it loads the causal language-modeling head needed for fine-tuning.
model = AutoModelForCausalLM.from_pretrained(config["model_name"])
tokenizer = AutoTokenizer.from_pretrained(config["model_name"])


In [None]:
# Build the training dataset and the batch collator.
# The dataset is comprised of paraphrase pairs, one pair per line, in the form
#   <s> sentence_1 </>>>>><p> paraphrase_1 </p>
# NOTE(review): the "</>" closing tag above looks like a typo for "</s>" - confirm against combined.txt.
dataset = TextDataset(
    file_path=config["train_dataset_filename"],
    block_size=config["data_collator_block_size"],
    tokenizer=tokenizer,
)
# mlm=False: no masked-language-modeling; batches are prepared for causal LM training.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)



In [None]:
# Training arguments handed to the Trainer.
# The Hub repository name is built from the base model name, and the
# remaining values come straight from `config`.
mn = config["model_name"]
training_args = TrainingArguments(
    push_to_hub=True,
    hub_model_id=f"{mn}-paraphraser",
    num_train_epochs=config["epochs"],
    per_device_train_batch_size=config["batch_size"],
    output_dir=config["output_dir"],
)

In [None]:
# Create the Trainer that will train the model on the text dataset,
# using the collator to assemble batches and `training_args` for settings.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

Cloning https://huggingface.co/SRM47/gpt2-large-paraphraser into local empty directory.


In [None]:

def report_gpu():
    """Print the processes currently using the GPU, then free unused memory.

    The report is printed first, so it reflects usage *before* the cleanup.
    Afterwards Python's garbage collector is run and PyTorch's CUDA cache
    is emptied. Returns nothing.
    """
    gpu_process_summary = torch.cuda.list_gpu_processes()
    print(gpu_process_summary)

    # Collect unreachable Python objects first, then empty the CUDA cache.
    gc.collect()
    torch.cuda.empty_cache()


In [None]:
# Show which processes hold GPU memory before training starts; the helper
# also runs garbage collection and empties the CUDA cache afterwards.
report_gpu()

GPU:0
process     157608 uses     4091.000 MB GPU memory


In [None]:
# Train the model: runs the Trainer built above with `training_args` on `dataset`.
trainer.train()

***** Running training *****
  Num examples = 34285
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4286
  Number of trainable parameters = 774030080


Step,Training Loss
500,1.4471
1000,1.3453
1500,1.2916
2000,1.2534
2500,1.2177
3000,1.1954
3500,1.18
4000,1.1677


Saving model checkpoint to gpt2-large-paraphraser/checkpoint-500
Configuration saved in gpt2-large-paraphraser/checkpoint-500/config.json
Model weights saved in gpt2-large-paraphraser/checkpoint-500/pytorch_model.bin
Saving model checkpoint to gpt2-large-paraphraser/checkpoint-1000
Configuration saved in gpt2-large-paraphraser/checkpoint-1000/config.json
Model weights saved in gpt2-large-paraphraser/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to gpt2-large-paraphraser/checkpoint-1500
Configuration saved in gpt2-large-paraphraser/checkpoint-1500/config.json
Model weights saved in gpt2-large-paraphraser/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to gpt2-large-paraphraser/checkpoint-2000
Configuration saved in gpt2-large-paraphraser/checkpoint-2000/config.json
Model weights saved in gpt2-large-paraphraser/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to gpt2-large-paraphraser/checkpoint-2500
Configuration saved in gpt2-large-paraphraser/checkpoint-

TrainOutput(global_step=4286, training_loss=1.2559782126992316, metrics={'train_runtime': 3473.3691, 'train_samples_per_second': 9.871, 'train_steps_per_second': 1.234, 'total_flos': 3.7305085231104e+16, 'train_loss': 1.2559782126992316, 'epoch': 1.0})

In [None]:
# Upload the trained model from the Trainer's output directory to the Hugging Face Hub.
trainer.push_to_hub()

Saving model checkpoint to gpt2-large-paraphraser
Configuration saved in gpt2-large-paraphraser/config.json
Model weights saved in gpt2-large-paraphraser/pytorch_model.bin
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.30k/2.92G [00:00<?, ?B/s]

Upload file runs/Dec15_10-50-33_26df7bda6c89/events.out.tfevents.1671101446.26df7bda6c89.3692.0:  60%|######  …

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/SRM47/gpt2-large-paraphraser
   5cad7fd..f0aada7  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/SRM47/gpt2-large-paraphraser
   5cad7fd..f0aada7  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
To https://huggingface.co/SRM47/gpt2-large-paraphraser
   f0aada7..eaa20f2  main -> main

   f0aada7..eaa20f2  main -> main



'https://huggingface.co/SRM47/gpt2-large-paraphraser/commit/f0aada786825d47d34558e48f85329a37f187cf8'

In [None]:
# Upload the tokenizer files to the Hub repository.
# NOTE(review): the repo id is hard-coded to one user's namespace, whereas the
# training arguments derive hub_model_id from config["model_name"]; keep the
# two in sync if the model name or the account changes.
tokenizer.push_to_hub("SRM47/gpt2-large-paraphraser")

tokenizer config file saved in gpt2-large-paraphraser/tokenizer_config.json
Special tokens file saved in gpt2-large-paraphraser/special_tokens_map.json
Uploading the following files to SRM47/gpt2-large-paraphraser: tokenizer.json,special_tokens_map.json,vocab.json,tokenizer_config.json,merges.txt


CommitInfo(commit_url='https://huggingface.co/SRM47/gpt2-large-paraphraser/commit/531395b05b0f4fdde7c13fe1e65865e04c44188f', commit_message='Upload tokenizer', commit_description='', oid='531395b05b0f4fdde7c13fe1e65865e04c44188f', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Write the final model to a local directory named after the base model.
model_name = config["model_name"]
save_to_path = f"./{model_name}-paraphraser"
trainer.save_model(output_dir=save_to_path)