https://www.kaggle.com/code/changyeop/how-to-fine-tune-gpt-2-for-beginners

In [16]:
import pandas as pd
import numpy as np
import re

In [17]:


def cleaning(s):
    s = str(s)
    s = re.sub('\s\W',' ',s)
    s = re.sub('\W,\s',' ',s)
    s = re.sub("\d+", "", s)
    s = re.sub('\s+',' ',s)
    s = re.sub('[!@#$_]', '', s)
    s = s.replace("co","")
    s = s.replace("https","")
    s = s.replace("[\w*"," ")
    return s



In [18]:
df = pd.read_csv("Articles.csv", encoding="ISO-8859-1") 
df = df.dropna()
text_data = open('Articles.txt', 'w')
for idx, item in df.iterrows():
  article = cleaning(item["Article"])
  text_data.write(article)
text_data.close()



In [19]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [20]:
def load_dataset(file_path, tokenizer, block_size = 128):
    dataset = TextDataset(
        tokenizer = tokenizer,
        file_path = file_path,
        block_size = block_size,
    )
    return dataset


def load_data_collator(tokenizer, mlm = False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, 
        mlm=mlm,
    )
    return data_collator


def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps=500):
  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
  train_dataset = load_dataset(train_file_path, tokenizer)
  data_collator = load_data_collator(tokenizer)

  tokenizer.save_pretrained(output_dir)
      
  model = GPT2LMHeadModel.from_pretrained(model_name)

  model.save_pretrained(output_dir)

  training_args = TrainingArguments(
          output_dir=output_dir,
          overwrite_output_dir=overwrite_output_dir,
          per_device_train_batch_size=per_device_train_batch_size,
          num_train_epochs=num_train_epochs,
          save_strategy="steps",
          save_steps=save_steps,
          save_total_limit=10,
          #resume_from_checkpoint="model/checkpoint-xxxxx",
          #no_cuda=True,
      )

  trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset,
  )
      
  trainer.train()
  trainer.save_model()

In [21]:
train_file_path = "Articles.txt"
model_name = 'gpt2'
output_dir = 'model/'
overwrite_output_dir = False
per_device_train_batch_size = 4
num_train_epochs = 5.0
save_steps = 100

In [22]:
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

loading file vocab.json from cache at C:\Users\malachmann/.cache\huggingface\hub\models--gpt2\snapshots\e7da7f221d5bf496a48136c0cd264e630fe9fcc8\vocab.json
loading file merges.txt from cache at C:\Users\malachmann/.cache\huggingface\hub\models--gpt2\snapshots\e7da7f221d5bf496a48136c0cd264e630fe9fcc8\merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at C:\Users\malachmann/.cache\huggingface\hub\models--gpt2\snapshots\e7da7f221d5bf496a48136c0cd264e630fe9fcc8\config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_hea

  0%|          | 0/10015 [00:00<?, ?it/s]

{'loss': 3.7824, 'learning_rate': 4.7503744383424864e-05, 'epoch': 0.25}
{'loss': 3.5211, 'learning_rate': 4.5007488766849725e-05, 'epoch': 0.5}
{'loss': 3.4245, 'learning_rate': 4.2511233150274586e-05, 'epoch': 0.75}
{'loss': 3.3394, 'learning_rate': 4.0014977533699454e-05, 'epoch': 1.0}


Saving model checkpoint to model/checkpoint-2003
Configuration saved in model/checkpoint-2003\config.json
Configuration saved in model/checkpoint-2003\generation_config.json
Model weights saved in model/checkpoint-2003\pytorch_model.bin


{'loss': 3.0928, 'learning_rate': 3.7518721917124315e-05, 'epoch': 1.25}
{'loss': 3.081, 'learning_rate': 3.5022466300549176e-05, 'epoch': 1.5}
{'loss': 3.0521, 'learning_rate': 3.252621068397404e-05, 'epoch': 1.75}
{'loss': 3.0246, 'learning_rate': 3.0029955067398902e-05, 'epoch': 2.0}


Saving model checkpoint to model/checkpoint-4006
Configuration saved in model/checkpoint-4006\config.json
Configuration saved in model/checkpoint-4006\generation_config.json
Model weights saved in model/checkpoint-4006\pytorch_model.bin


{'loss': 2.8817, 'learning_rate': 2.7533699450823763e-05, 'epoch': 2.25}
{'loss': 2.8536, 'learning_rate': 2.503744383424863e-05, 'epoch': 2.5}
{'loss': 2.8481, 'learning_rate': 2.254118821767349e-05, 'epoch': 2.75}
{'loss': 2.8438, 'learning_rate': 2.0044932601098353e-05, 'epoch': 3.0}


Saving model checkpoint to model/checkpoint-6009
Configuration saved in model/checkpoint-6009\config.json
Configuration saved in model/checkpoint-6009\generation_config.json
Model weights saved in model/checkpoint-6009\pytorch_model.bin


{'loss': 2.7147, 'learning_rate': 1.7548676984523215e-05, 'epoch': 3.25}
{'loss': 2.7272, 'learning_rate': 1.5052421367948077e-05, 'epoch': 3.49}
{'loss': 2.7135, 'learning_rate': 1.2556165751372942e-05, 'epoch': 3.74}
{'loss': 2.7327, 'learning_rate': 1.0059910134797803e-05, 'epoch': 3.99}


Saving model checkpoint to model/checkpoint-8012
Configuration saved in model/checkpoint-8012\config.json
Configuration saved in model/checkpoint-8012\generation_config.json
Model weights saved in model/checkpoint-8012\pytorch_model.bin


{'loss': 2.641, 'learning_rate': 7.563654518222667e-06, 'epoch': 4.24}
{'loss': 2.6477, 'learning_rate': 5.067398901647529e-06, 'epoch': 4.49}
{'loss': 2.6409, 'learning_rate': 2.5711432850723917e-06, 'epoch': 4.74}
{'loss': 2.6277, 'learning_rate': 7.488766849725412e-08, 'epoch': 4.99}


Saving model checkpoint to model/checkpoint-10015
Configuration saved in model/checkpoint-10015\config.json
Configuration saved in model/checkpoint-10015\generation_config.json
Model weights saved in model/checkpoint-10015\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to model/
Configuration saved in model/config.json
Configuration saved in model/generation_config.json


{'train_runtime': 5184.7799, 'train_samples_per_second': 7.725, 'train_steps_per_second': 1.932, 'train_loss': 2.9590925303329185, 'epoch': 5.0}


Model weights saved in model/pytorch_model.bin


In [23]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [26]:
def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model


def load_tokenizer(tokenizer_path):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return tokenizer


def generate_text(sequence, max_length):
    model_path = "model/"
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [28]:
sequence = input() # oil price
max_len = int(input()) # 20
generate_text(sequence, max_len) # oil price for July June which had been low at as low as was originally stated Prices have since resumed


loading configuration file model/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.26.0",
  "use_cache": true,
  "vocab_size": 50257
}

loading weights file mo

gold price in Pakistan and Nigeria is likely to remain steady after the recent OPEC oil policy meeting in which members agreed to stabilise prices.However, some analysts warn that growth may also not be on the way back up in the face of increased U.
