# Fine-tune GPT2 with Eval_dataset

In [1]:
!pip install datasets
!pip install accelerate -U
!pip install evaluate

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-

## Load Dataset

In [9]:
from datasets import load_dataset

# Load dataset: brianarbuckle/cocktail_recipes
# 加载数据集，此处的数据集为brianarbuckle/cocktail_recipes，鸡尾酒菜单
dataset = load_dataset('brianarbuckle/cocktail_recipes', split='train')

## Load Tokenizer

In [3]:
from transformers import GPT2Tokenizer

# Initialize tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium', max_length=128)

# set pad_token of tokenizer as eos_token
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

## Preprocess the Dataset

In [10]:
import torch
from torch.utils.data import Dataset

In [11]:
#define MyDataset
class MyDataset(Dataset):
    #self为类的实例，实例属性为tokenizer,recipes,block_size
    def __init__(self, tokenizer, recipes, block_size):
        self.tokenizer = tokenizer #由tokenizer给实例self的属性tokenizer赋值
        self.samples = []
        self.block_size = block_size

        for recipe in recipes:
            text = recipe['title'] + " "  # Start with the title
            text += "Ingredients: " + ', '.join(recipe['ingredients']) + ". "  # Add ingredients
            text += "Directions: " + ' '.join(recipe['directions']) + ". "  # Add directions

            # call __call__ from tokenizer for automatic padding
            tokenized_text = tokenizer(text, truncation=True, max_length=block_size, padding="max_length", return_tensors="pt")

            # adding tokenized_text to samples
            self.samples.append(tokenized_text)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        # return a dict: input_ids, attention_mask and labels
        sample = self.samples[idx]
        # for language models, labels equal to input_ids in general
        sample["labels"] = sample["input_ids"].clone()
        return {key: value.squeeze(0) for key, value in sample.items()}

# Prepare the list of recipe texts
# 'dataset' yields dictionaries directly
recipes = [example for example in dataset]

# recreate dataset
my_dataset = MyDataset(tokenizer, recipes, block_size=128)

## Split the Dataset

In [12]:
from sklearn.model_selection import train_test_split
# divide the dataset into training and testing parts
# shrunk_dataset, _ = train_test_split(my_dataset, test_size=0.2, random_state=42)
train_dataset, eval_dataset = train_test_split(my_dataset, test_size=0.2, random_state=42)
# print("size of previous dataset:", len(my_dataset))
# print("size of shrunk dataset:", len(shrunk_dataset))
print("size of training dataset:", len(train_dataset))
print("size of evaluation dataset:", len(eval_dataset))

size of training dataset: 700
size of evaluation dataset: 175


## Load Model (GPT2-meduim)

In [7]:
from transformers import GPT2LMHeadModel

# loading pretrained model
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

## Define evaluation metrics

### previous

In [8]:
# def compute_metrics(pred):
#     labels = pred.label_ids
#     preds = pred.predictions.argmax(-1)
#     precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
#     acc = accuracy_score(labels, preds)
#     return {
#         'accuracy': acc,
#         'f1': f1,
#         'precision': precision,
#         'recall': recall
#     }

### Perplexity Evaluation

In [5]:
import evaluate

In [14]:
from torch.utils.data import DataLoader

# 将数据集转换为DataLoader
eval_dataloader = DataLoader(eval_dataset, batch_size=32, shuffle=True)

In [23]:
def evaluate_perplexity(model, tokenizer, dataloader):
    model.eval()
    total_loss = 0
    for batch in dataloader:
        inputs = batch['input_ids']
        labels = batch['labels']
        attention_mask = batch['attention_mask']
        with torch.no_grad():
            outputs = model(inputs, labels = labels, attention_mask = attention_mask)
            loss = outputs.loss
            total_loss += loss.item()
    average_loss = total_loss / len(dataloader)
    perplexity = torch.exp(torch.tensor(average_loss))
    return perplexity.item()
# Ensure your dataloader returns a batch in the format expected ('input_ids', 'labels', and 'attention_mask')

# Define BLEU
def evaluate_bleu(model, tokenizer, dataloader):
    model.eval()
    bleu = evaluate.load("bleu")
    all_predictions = []
    all_references = []
    for batch in dataloader:
        inputs = batch['input_ids']
        labels = batch['labels']
        attention_mask = batch['attention_mask']

        # Truncate input_ids to length 20
        # print(inputs['input_ids'].shape)
        # inputs['input_ids'] = inputs['input_ids'][:, :20]

        # generate predictions
        with torch.no_grad():
            outputs = model.generate(inputs, labels = labels, attention_mask = attention_mask)

        predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        references = tokenizer.batch_decode(labels, skip_special_tokens=True)

        all_predictions.extend(predictions)
        all_references.extend(references)

    bleu_score = bleu.compute(predictions = all_predictions, references = all_references)
    return bleu_score

## Train

In [12]:
from transformers import Trainer, TrainingArguments

In [1]:
# Defining training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    overwrite_output_dir=True,       # overwrite to output directory
    num_train_epochs=4,              # number of training epochs
    per_device_train_batch_size=4,   # batch size
    save_steps=1000,                 # save model per steps
    save_total_limit=2,              # total number of saved models
    logging_dir='./logs',            # log directory
    logging_steps=100,               # save log per steps
    # prediction_loss_only=True,      # predict the loss only, default as False
    learning_rate=1e-4,              # learning rate
    warmup_steps=500,                # warmup steps
    # gradient_accumulation_steps=5,         # gradient accumulation梯度累积步数
    # fp16=True,                   # Mixed Precision Training
)

# Initializing trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # to use specified dataset
    eval_dataset=eval_dataset,  # to designate evaluation dataset if needed
    # compute_metrics=evaluate_perplexity,
)

trainer.train()

NameError: name 'TrainingArguments' is not defined

In [None]:
torch.cuda.empty_cache()
trainer.evaluate()

## Save

In [None]:
# Set drive location as /mydrive
!ln -s /content/drive/MyDrive/ /mydrive
# See inside of /mdrive folder
!ls /mydrive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# save fine-tuned model
model.save_pretrained('./drive/MyDrive')

# save tokenizer
tokenizer.save_pretrained('./drive/MyDrive')

## Evaluate

In [15]:
model = GPT2LMHeadModel.from_pretrained('./drive/MyDrive/20240416_saved_model_and_tokenizer')

perplexity = evaluate_perplexity(model, tokenizer, eval_dataloader)
print(f"Perplexity: {perplexity}")

bleu = evaluate_bleu(model, tokenizer, eval_dataloader)
print(f"BLEU: {bleu}")

Perplexity: 2.155730962753296


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


ValueError: Input length of input_ids is 128, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.