<a href="https://colab.research.google.com/github/Sciform/sciform-hwz-ai-in-controlling/blob/main/lecture_3/3_2_fine_tuning_gpt2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# How to fine-tune GPT2

In [1]:
!pip install torch
!pip install transformers[torch]
!pip install datasets
!pip install jsonlines

Collecting transformers[torch]
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers[torch])
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m2

In [2]:
import pandas as pd
import datasets

import logging
from pprint import pprint

import torch

from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, EvalPrediction
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import numpy as np
from scipy.special import softmax
from sklearn.metrics import log_loss


logger = logging.getLogger(__name__)

In [3]:
from datasets import load_dataset

lamini_dataset_path = "lamini/lamini_docs"
lamini_dataset = load_dataset(lamini_dataset_path)

train_dataset = lamini_dataset['train']
test_dataset = lamini_dataset['test']

print(train_dataset)
print(test_dataset)

Downloading readme:   0%|          | 0.00/577 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/615k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/83.7k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1260 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/140 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1260
})
Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 140
})


In [4]:
# https://huggingface.co/docs/transformers/tasks/question_answering

print("question = ", train_dataset['question'][0])
print("answer = ", train_dataset['answer'][0])

question =  How can I evaluate the performance and quality of the generated text from Lamini models?
answer =  There are several metrics that can be used to evaluate the performance and quality of generated text from Lamini models, including perplexity, BLEU score, and human evaluation. Perplexity measures how well the model predicts the next word in a sequence, while BLEU score measures the similarity between the generated text and a reference text. Human evaluation involves having human judges rate the quality of the generated text based on factors such as coherence, fluency, and relevance. It is recommended to use a combination of these metrics for a comprehensive evaluation of the model's performance.


In [5]:
model_name = "gpt2"
# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Load GPT-2 model
base_model = GPT2LMHeadModel.from_pretrained(model_name)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [6]:
device_count = torch.cuda.device_count()
if device_count > 0:
    logger.debug("Select GPU device")
    device = torch.device("cuda")
else:
    logger.debug("Select CPU device")
    device = torch.device("cpu")

print(device)

cpu


In [7]:
base_model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [8]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
  # Tokenize
  input_ids = tokenizer.encode(
          text,
          return_tensors="pt",
          truncation=True,
          max_length=max_input_tokens
  )

  # Generate
  device = model.device
  generated_tokens_with_prompt = model.generate(
    input_ids=input_ids.to(device),
    max_length=max_output_tokens
  )

  # Decode
  generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

  # Strip the prompt
  generated_text_answer = generated_text_with_prompt[0][len(text):]

  return generated_text_answer

# Use the base GPT2 model first

In [9]:
test_text = test_dataset[0]['question']
print("Question input (test):", test_text)
print(f"Correct answer from Lamini docs: {test_dataset[0]['answer']}")
print("Model's answer: ")
print(inference(test_text, base_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question input (test): Can Lamini generate technical documentation or user manuals for software projects?
Correct answer from Lamini docs: Yes, Lamini can generate technical documentation and user manuals for software projects. It uses natural language generation techniques to create clear and concise documentation that is easy to understand for both technical and non-technical users. This can save developers a significant amount of time and effort in creating documentation, allowing them to focus on other aspects of their projects.
Model's answer: 


I don't know. I'm not sure if I can get a good answer.

I'm not sure if I can get a good answer. I'm not sure if I can get a good answer. I'm not sure if I can get a good answer. I'm not sure if I can get a good answer. I'm not sure if I can get a good answer. I'm not sure if I


# Fine-tune model

In [10]:
max_steps = 3

In [11]:
trained_model_name = f"lamini_docs_{max_steps}_steps"
output_dir = trained_model_name

In [12]:
training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,

  # Number of training epochs
  num_train_epochs=1,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=max_steps,

  # Batch size for training
  per_device_train_batch_size=1,

  # Directory to save model checkpoints
  output_dir=output_dir,

  # Other arguments
  overwrite_output_dir=False, # Overwrite the content of the output directory
  disable_tqdm=False, # Disable progress bars
  eval_steps=120, # Number of update steps between two evaluations
  save_steps=120, # After # steps model is saved
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1, # Batch size for evaluation
  evaluation_strategy="steps",
  logging_strategy="steps",
  logging_steps=1,
  optim="adafactor",
  gradient_accumulation_steps = 4,
  gradient_checkpointing=False,

  # Parameters for early stopping
  load_best_model_at_end=True,
  save_total_limit=1,
  metric_for_best_model="eval_loss",
  greater_is_better=False
)


In [13]:
trainer = Trainer(
    model=base_model,
 #   model_flops=model_flops,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [14]:
training_output = trainer.train()

Step,Training Loss,Validation Loss


In [15]:
save_dir = f'{output_dir}/final'

trainer.save_model(save_dir)
print("Saved model to:", save_dir)

Saved model to: lamini_docs_3_steps/final


In [16]:
finetuned_slightly_model = GPT2LMHeadModel.from_pretrained(save_dir, local_files_only=True)

In [17]:
finetuned_slightly_model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [18]:
finetuned_longer_model = GPT2LMHeadModel.from_pretrained("lamini/lamini_docs_finetuned")
#tokenizer = GPT2Tokenizer.from_pretrained("lamini/lamini_docs_finetuned")

#finetuned_longer_model.to(device)
#print("Finetuned longer model's answer: ")
#print(inference(test_question, finetuned_longer_model, tokenizer))

Downloading (…)lve/main/config.json:   0%|          | 0.00/717 [00:00<?, ?B/s]

You are using a model of type gpt_neox to instantiate a model of type gpt2. This is not supported for all configurations of models and can yield errors.


Downloading model.safetensors:   0%|          | 0.00/282M [00:00<?, ?B/s]

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at lamini/lamini_docs_finetuned and are newly initialized: ['h.5.mlp.c_proj.weight', 'h.1.mlp.c_fc.weight', 'lm_head.weight', 'h.4.mlp.c_fc.bias', 'h.0.attn.c_attn.weight', 'h.0.ln_2.bias', 'h.0.mlp.c_proj.bias', 'h.0.ln_1.bias', 'h.5.attn.c_proj.weight', 'h.1.ln_1.weight', 'h.2.ln_1.weight', 'h.4.ln_1.bias', 'h.2.mlp.c_fc.weight', 'h.3.attn.c_proj.bias', 'h.3.attn.c_proj.weight', 'h.2.attn.c_proj.bias', 'h.1.attn.c_attn.weight', 'h.0.attn.c_proj.bias', 'h.4.attn.c_proj.weight', 'h.5.ln_2.bias', 'wpe.weight', 'h.2.ln_2.weight', 'h.0.attn.c_proj.weight', 'h.3.ln_2.weight', 'h.3.ln_1.bias', 'h.5.mlp.c_fc.bias', 'h.5.attn.c_attn.bias', 'h.2.ln_1.bias', 'h.2.ln_2.bias', 'h.4.attn.c_attn.bias', 'ln_f.bias', 'h.4.attn.c_attn.weight', 'h.4.ln_2.bias', 'h.1.mlp.c_proj.weight', 'ln_f.weight', 'wte.weight', 'h.4.mlp.c_proj.bias', 'h.2.mlp.c_proj.bias', 'h.3.mlp.c_proj.bias', 'h.0.mlp.c_fc.weight', 'h.3.attn.c_attn.wei

Downloading (…)neration_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [19]:
test_question = test_dataset[0]['question']
print("Question input (test):", test_question)

print("Finetuned slightly model's answer: ")
print(inference(test_question, finetuned_slightly_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question input (test): Can Lamini generate technical documentation or user manuals for software projects?
Finetuned slightly model's answer: 


Lamini is a software development company that has been working on software for over 20 years. We have been working on software for over 20 years. We have been working on software for over 20 years. We have been working on software for over 20 years. We have been working on software for over 20 years. We have been working on software for over 20 years. We have been working on software for over 20 years.


In [20]:
test_answer = test_dataset[0]['answer']
print("Target answer output (test):", test_answer)

Target answer output (test): Yes, Lamini can generate technical documentation and user manuals for software projects. It uses natural language generation techniques to create clear and concise documentation that is easy to understand for both technical and non-technical users. This can save developers a significant amount of time and effort in creating documentation, allowing them to focus on other aspects of their projects.
