# GPT-2 Fine-Tuning

## Step 1. Data preprocessing

#### The data contains unnecessary newlines, tags, and URLs, which need to be removed before preprocessing.

In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
def cleaning(s):
    """Normalize one raw article into a single line of plain text.

    The value is converted with ``str`` and then, in order: URLs are removed,
    stray punctuation that follows whitespace is dropped, digits are removed,
    runs of whitespace (including newlines) are collapsed to one space and the
    characters ``!@#$_`` are deleted.

    Args:
        s: Raw article text; any object is accepted and passed through ``str``.

    Returns:
        The cleaned string. Leading/trailing spaces are not stripped.
    """
    s = str(s)
    # Remove whole URLs up front. The previous implementation deleted the bare
    # substrings "co" and "https" instead, which left URL fragments behind and
    # corrupted ordinary words ("economic" -> "enomic", "according" -> "acrding").
    s = re.sub(r'https?://\S+|www\.\S+', ' ', s)
    s = re.sub(r'\s\W', ' ', s)
    s = re.sub(r'\W,\s', ' ', s)
    s = re.sub(r'\d+', '', s)
    s = re.sub(r'\s+', ' ', s)
    s = re.sub(r'[!@#$_]', '', s)
    # Literal (non-regex) replacement carried over unchanged.
    # NOTE(review): this looks like it was meant to be a pattern - confirm intent.
    s = s.replace(r"[\w*", " ")
    return s

In [None]:
# Load the raw articles and drop every row that has a missing value.
df = pd.read_csv("Articles.csv", encoding="ISO-8859-1")
df = df.dropna()

# Write one cleaned article per line. `cleaning` collapses all whitespace to
# single spaces, so the newline unambiguously marks an article boundary and
# keeps the last word of one article from fusing with the first word of the
# next. The context manager guarantees the file is flushed and closed even if
# cleaning or writing raises.
with open('Articles.txt', 'w', encoding='utf-8') as text_data:
    for article in df["Article"]:
        text_data.write(cleaning(article) + "\n")


## Step 2. Model Training

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 4.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 60.4 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 30.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.2 transformers-4.24.0


In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [None]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a ``TextDataset`` over a plain-text training file.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Block size handed to ``TextDataset`` (default 128).

    Returns:
        The constructed ``TextDataset``.
    """
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)


def load_data_collator(tokenizer, mlm = False):
    """Create the ``DataCollatorForLanguageModeling`` used for batching.

    Args:
        tokenizer: Tokenizer handed to the collator.
        mlm: Forwarded as the collator's ``mlm`` flag (default ``False``).

    Returns:
        The constructed ``DataCollatorForLanguageModeling``.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a pretrained GPT-2 LM head model on a text file.

    The tokenizer and the model are saved to ``output_dir`` before training,
    and the trained model is saved again by the ``Trainer`` when training ends.

    Args:
        train_file_path: Text file to train on (loaded through ``load_dataset``).
        model_name: Name or path given to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory for the tokenizer, checkpoints and final model.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # The Trainer below is not given the tokenizer, so persist it explicitly;
    # inference reloads it from this same directory.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Previously accepted by train() but silently dropped, so the
        # caller's value had no effect on the run.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [None]:
from google.colab import drive

# Mount at /content/drive: the training output_dir and the inference cells all
# use /content/drive/MyDrive/..., so mounting anywhere else (previously
# /content/gdrive) leaves the results on the ephemeral Colab disk, not on Drive.
drive.mount('/content/drive')

Mounted at /content/gdrive


In [None]:
# Training configuration: adjust these values before running the training cell.

# Base checkpoint and training corpus.
model_name = 'gpt2'
train_file_path = "/content/Articles.txt"

# Where the tokenizer, checkpoints and final model are written.
output_dir = '/content/drive/MyDrive/result'
overwrite_output_dir = False

# Values handed to train() for the optimisation run.
num_train_epochs = 5.0
per_device_train_batch_size = 8
save_steps = 500

In [None]:
# Launch fine-tuning with the parameters set in the configuration cell.
# It takes about 30 minutes to train in colab.
train(train_file_path=train_file_path, model_name=model_name,
      output_dir=output_dir, overwrite_output_dir=overwrite_output_dir,
      per_device_train_batch_size=per_device_train_batch_size,
      num_train_epochs=num_train_epochs, save_steps=save_steps)

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]



Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

***** Running training *****
  Num examples = 8024
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5015
  Number of trainable parameters = 124439808


Step,Training Loss
500,3.6994
1000,3.4088
1500,3.1663
2000,3.1265
2500,2.9754
3000,2.9591
3500,2.8561
4000,2.8525
4500,2.7915
5000,2.7821


Saving model checkpoint to /content/drive/MyDrive/result/checkpoint-500
Configuration saved in /content/drive/MyDrive/result/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/result/checkpoint-500/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/result/checkpoint-1000
Configuration saved in /content/drive/MyDrive/result/checkpoint-1000/config.json
Model weights saved in /content/drive/MyDrive/result/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/result/checkpoint-1500
Configuration saved in /content/drive/MyDrive/result/checkpoint-1500/config.json
Model weights saved in /content/drive/MyDrive/result/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/result/checkpoint-2000
Configuration saved in /content/drive/MyDrive/result/checkpoint-2000/config.json
Model weights saved in /content/drive/MyDrive/result/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to /content/dri

## Step 3. Inference

In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [None]:
def load_model(model_path):
    """Return a ``GPT2LMHeadModel`` loaded via ``from_pretrained(model_path)``."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Return a ``GPT2Tokenizer`` loaded via ``from_pretrained(tokenizer_path)``."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)


def generate_text(model,tok,sequence, max_length):
    """Sample a continuation of ``sequence`` and return it as text.

    Args:
        model: Object exposing ``generate(...)`` and ``config.eos_token_id``.
        tok: Tokenizer exposing ``encode`` and ``decode``; used for both
            directions so the ids are decoded with the vocabulary that
            produced them.
        sequence: Prompt; it is formatted into a string before encoding.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    ids = tok.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        # The EOS id is reused as the padding id.
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    # Decode with the tokenizer that was passed in. The previous code reached
    # for a global named `tokenizer`, which fails on a fresh kernel and can
    # silently use a different tokenizer than the one given by the caller.
    return tok.decode(final_outputs[0], skip_special_tokens=True)

In [None]:
# Reload the model and the tokenizer from the directory that the training
# parameters use as `output_dir`.
model = load_model("/content/drive/MyDrive/result")
tokenizer = load_tokenizer("/content/drive/MyDrive/result")

loading configuration file /content/drive/MyDrive/result/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "use_cache": true,
  "vocab_size": 50257
}


In [None]:
# Interactive demo: the first line typed is the prompt, the second is parsed
# as an integer and passed to generate_text as max_length.
sequence = input() 
max_len = int(input()) 
# Last expression of the cell, so the notebook displays the generated string.
generate_text(model,tokenizer,sequence, max_len) 

jahnavi is a good girl
35
jahnavi is a good girl. She is a good girl. I am a good boy, and she is a good girl.Pakistan Cricket Board PCB) chief


In [None]:
# Same interactive demo as above, run with a different prompt and a longer
# max_length: first input line is the prompt, second is the integer length.
sequence = input() 
max_len = int(input()) 
# Last expression of the cell, so the notebook displays the generated string.
generate_text(model,tokenizer,sequence, max_len) 

education is an endless ocean
200
education is an endless ocean for talent, where no small number of talented youngsters have reached senior levels."The government´s enomic reforms have focused on raising productivity and improving the efficiency of businesses, while providing an opportunity to improve the living standards of the people," said Ali Qasim, head of reform efforts in the central bank´s enomic affairs bureau.Qasim emphasized that the enomic reform package would not only improve business performance but would also help mmodity markets as it would help bring down the current sts of state investment in agriculture and energy as well as on education.The government will boost the export of agricultural chemicals by. million tonnes this year while the import of chemicals in rporate form from third-party mponents will reach. million tonnes, he said, adding the government is also introducing reforms in education and employment policies.strong>WASHINGTON: Asian markets had rerded th

In [None]:
# Interactive demo again: first input line is the prompt, second is the
# integer passed to generate_text as max_length.
sequence = input() 
max_len = int(input()) 
# Last expression of the cell, so the notebook displays the generated string.
generate_text(model,tokenizer,sequence, max_len) 

My life
50
My life was about to change, and we had to do it.It´s been tough but we have been working hard and everything is finally getting going. The hope is that by now we have learnt a lot from the success of this year.


In [None]:
# Read a prompt and an integer max_length, generate once, and keep the result
# in `text` instead of displaying it.
sequence = input() 
max_len = int(input()) 
text = generate_text(model,tokenizer,sequence,max_len)
# Displays the type of the returned value (the recorded output is `str`).
type(text)



my life
30


str

In [None]:
# Iterative refinement: read a prompt and an initial max_length, then keep
# feeding the generated text back in as the next prompt until the user
# answers "yes".
text = input()
max_len = int(input())
user_in = "no"
# Normalise the answer so "Yes" or "yes " also end the loop.
while user_in.strip().lower() != "yes":
    text = generate_text(model, tokenizer, text, max_len)
    print(text)
    # The next round's budget is the character count of the current text,
    # capped at the model's context size. Uncapped, the budget passes
    # n_positions after a few rounds and generation runs past the positions
    # the model supports.
    max_len = min(len(text), model.config.n_positions)
    user_in = input("\nNot satisfied with output? enter yes if you are satisfied, no if not satisfied\n")


oil prices are high
10
oil prices are high because of the US shale boom

Not satisfied with output? enter yes if you are satisfied, no if not satisfied
no
oil prices are high because of the US shale boom.In the past two months, oil prices have jumped. percent, driven by the surge in US crude that is now in excess of a tonne and as a result crude supplies are lower

Not satisfied with output? enter yes if you are satisfied, no if not satisfied
no
oil prices are high because of the US shale boom.In the past two months, oil prices have jumped. percent, driven by the surge in US crude that is now in excess of a tonne and as a result crude supplies are lower," the mments said. But US shale production, which was last around, barrels per day, declined by the time Obama took office to, barrels per day, acrding to data from the Baker Hughes in Baton Rouge, Louisiana.US shale oil drilling activity rose percent in the month of January from the month of December while in November production increa

In [None]:
# Iterative refinement driven by a rating: read a prompt and an initial
# max_length, then keep extending the text while the rating is 3 or lower.
text = input()
max_len = int(input())
user_in = 2  # start below the acceptance threshold so the loop runs at least once
while user_in <= 3:
    text = generate_text(model, tokenizer, text, max_len)
    print(text)
    # The next round's budget is the character count of the current text,
    # capped at the model's context size so repeated rounds cannot request
    # more positions than the model supports.
    max_len = min(len(text), model.config.n_positions)
    user_in = int(input("\nRate the output between 1 to 5: \n"))

In [None]:
# Generate once from a user prompt, then keep extending the result until the
# user answers "yes". (A stray `do` statement used to open this cell; Python
# has no do-while, so it raised NameError before anything ran.)
sequence = input()
max_len = int(input())
text = generate_text(model, tokenizer, sequence, max_len)
print(text)
user_in = input("not satisfied with output? enter yes if u are satisfied, no if not satisfied")
while user_in != "yes":
    # Budget for the next round: character count of the current text, capped
    # at the model's context size so it cannot exceed the supported positions.
    text = generate_text(model, tokenizer, text, min(len(text), model.config.n_positions))
    print(text)
    user_in = input("not satisfied with output? enter yes if u are satisfied, no if not satisfied")
