<a href="https://colab.research.google.com/github/ThisIsFarhan/GPT2-FineTuning/blob/main/gpt2_FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets



In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset, load_dataset

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained("distilbert/distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilbert/distilgpt2")

tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
from google.colab import files
uploaded = files.upload()

Saving scene_descriptions_dataset.json to scene_descriptions_dataset.json


In [None]:
data_id = "scene_descriptions_dataset.json"

In [None]:
def prepare_train_data(data_id):
    data = load_dataset("json", data_files=data_id, split="train")
    data_df = data.to_pandas()
    data_df["text"] = data_df[["input", "output"]].apply(lambda x: x["input"] + ".\n" + x["output"], axis=1)
    data = Dataset.from_pandas(data_df[['text']])
    return data

In [None]:
dataset = prepare_train_data(data_id)

In [None]:
dataset

Dataset({
    features: ['text'],
    num_rows: 1000
})

In [None]:
dataset['text'][0]

'left_input: phone, chair, lamp\nright_input: cabinet, person, human\nup_input: mouse, glasses\nbottom_input: laptop, shelf.\nIn the left side area, there were a phone, a chair, and a lamp. In the right area, there were a cabinet, a person, and a human. In the above area, there were a mouse and a glasses. a laptop and a shelf were detected in the below section.'

In [None]:
def tokenize_and_add(data):
  tokens = tokenizer(data['text'], truncation=True, padding='max_length', max_length=180)
  return {
      'input_ids':tokens['input_ids'],
      'labels':tokens['input_ids']
  }

tokenized_dataset = dataset.map(tokenize_and_add, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
tokenized_dataset

Dataset({
    features: ['text', 'input_ids', 'labels'],
    num_rows: 1000
})

In [None]:
training_args = TrainingArguments(
    output_dir="content/gpt2-finetuned",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_dir = "./logs",
    logging_steps = 10,
    save_steps=10,
    fp16=True,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

In [None]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfarhanak128[0m ([33mfarhanak128-comsats-university-islamabad[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,6.9881
20,1.3316
30,0.6116
40,0.4406
50,0.3457
60,0.3127
70,0.3017
80,0.2841
90,0.2681
100,0.2695


TrainOutput(global_step=375, training_loss=0.46794338258107504, metrics={'train_runtime': 486.6043, 'train_samples_per_second': 6.165, 'train_steps_per_second': 0.771, 'total_flos': 137793208320000.0, 'train_loss': 0.46794338258107504, 'epoch': 3.0})

In [None]:
model.save_pretrained('./gpt2-FYP')
tokenizer.save_pretrained('./gpt2-FYP')

('./gpt2-FYP/tokenizer_config.json',
 './gpt2-FYP/special_tokens_map.json',
 './gpt2-FYP/vocab.json',
 './gpt2-FYP/merges.txt',
 './gpt2-FYP/added_tokens.json')

In [None]:
model_path = './gpt2-FYP'
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

def response(input):
  inputs = tokenizer.encode(input,return_tensors="pt")
  outputs = model.generate(inputs, max_length=150, num_return_sequences=1)
  response = tokenizer.decode(outputs[0], skip_special_tokens=True)
  return response

In [None]:
response('left_input: phone, chair, lamp\nright_input: cabinet')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'left_input: phone, chair, lamp\nright_input: cabinet, lamp\nup_input: lamp, lamp, person.\na phone, a chair, and a lamp were observed left side. a cabinet, a lamp, and a person were observed right side. a lamp, a lamp, and a person were observed upper.'

In [None]:
!zip -r /content/gpt2_finetuned_model.zip /content/gpt2-FYP

  adding: content/gpt2-FYP/ (stored 0%)
  adding: content/gpt2-FYP/model.safetensors (deflated 7%)
  adding: content/gpt2-FYP/tokenizer_config.json (deflated 56%)
  adding: content/gpt2-FYP/vocab.json (deflated 68%)
  adding: content/gpt2-FYP/generation_config.json (deflated 24%)
  adding: content/gpt2-FYP/merges.txt (deflated 53%)
  adding: content/gpt2-FYP/config.json (deflated 51%)
  adding: content/gpt2-FYP/special_tokens_map.json (deflated 74%)


In [None]:
from google.colab import drive
drive.mount('/content/drive')

!cp /content/gpt2_finetuned_model.zip /content/drive/MyDrive/

Mounted at /content/drive
