# Fine-tune GPT2 with Eval_dataset

## Load Dataset

In [1]:
!pip install datasets
from datasets import load_dataset

# Load dataset: brianarbuckle/cocktail_recipes
# 加载数据集，此处的数据集为brianarbuckle/cocktail_recipes，鸡尾酒菜单
dataset = load_dataset('brianarbuckle/cocktail_recipes', split='train')



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Load Tokenizer

In [2]:
from transformers import GPT2Tokenizer

# Initialize tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

# set pad_token of tokenizer as eos_token
tokenizer.pad_token = tokenizer.eos_token

## Preprocess the Dataset

In [3]:
import torch
from torch.utils.data import Dataset

In [4]:
#define MyDataset
class MyDataset(Dataset):
    #self为类的实例，实例属性为tokenizer,recipes,block_size
    def __init__(self, tokenizer, recipes, block_size):
        self.tokenizer = tokenizer #由tokenizer给实例self的属性tokenizer赋值
        self.samples = []

        for recipe in recipes:
            text = recipe['title'] + " "  # Start with the title
            text += "Ingredients: " + ', '.join(recipe['ingredients']) + ". "  # Add ingredients
            text += "Directions: " + ' '.join(recipe['directions']) + ". "  # Add directions

            # call __call__ from tokenizer for automatic padding
            tokenized_text = tokenizer(text, truncation=True, max_length=block_size, padding="max_length", return_tensors="pt")

            # adding tokenized_text to samples
            self.samples.append(tokenized_text)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        # return a dict: input_ids, attention_mask and labels
        sample = self.samples[idx]
        # for language models, labels equal to input_ids in general
        sample["labels"] = sample["input_ids"].clone()
        return {key: value.squeeze(0) for key, value in sample.items()}

# Prepare the list of recipe texts
# 'dataset' yields dictionaries directly
recipes = [example for example in dataset]

# recreate dataset
my_dataset = MyDataset(tokenizer, recipes, block_size=128)

## Split the Dataset

In [5]:
from sklearn.model_selection import train_test_split
# divide the dataset into training and testing parts
train_dataset, eval_dataset = train_test_split(my_dataset, test_size=0.2, random_state=42)

## Load Model (GPT2-meduim)

In [6]:
from transformers import GPT2LMHeadModel

# loading pretrained model
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')

## Define evaluation metrics

### previous

In [7]:
# def compute_metrics(pred):
#     labels = pred.label_ids
#     preds = pred.predictions.argmax(-1)
#     precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
#     acc = accuracy_score(labels, preds)
#     return {
#         'accuracy': acc,
#         'f1': f1,
#         'precision': precision,
#         'recall': recall
#     }

### Perplexity Evaluation

In [8]:
from torch.utils.data import DataLoader

# 将数据集转换为DataLoader
eval_dataloader = DataLoader(eval_dataset, batch_size=32, shuffle=True)

In [9]:
def evaluate_perplexity(model, tokenizer, dataloader):
    model.eval()
    total_loss = 0
    for batch in dataloader:
        inputs = batch['input_ids']
        labels = batch['labels']
        attention_mask = batch['attention_mask']
        with torch.no_grad():
            outputs = model(inputs, labels=labels, attention_mask=attention_mask)
            loss = outputs.loss
            total_loss += loss.item()
    average_loss = total_loss / len(dataloader)
    perplexity = torch.exp(torch.tensor(average_loss))
    return perplexity.item()
# Ensure your dataloader returns a batch in the format expected ('input_ids', 'labels', and 'attention_mask')
perplexity = evaluate_perplexity(model, tokenizer, eval_dataloader)
print(f"Perplexity of GPT2: {perplexity}")

Perplexity of GPT2: 1514.118896484375


## Train and Evaluate

In [23]:
!pip install accelerate -U
from transformers import Trainer, TrainingArguments

# Defining training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    overwrite_output_dir=True,       # overwrite to output directory
    num_train_epochs=4,              # number of training epochs
    per_device_train_batch_size=8,   # batch size
    save_steps=1000,                 # save model per steps
    save_total_limit=2,              # total number of saved models
    logging_dir='./logs',            # log directory
    logging_steps=100,               # save log per steps
#    prediction_loss_only=True,      # predict the loss only, default as False
    learning_rate=1e-5,              # learning rate
    warmup_steps=500,                # warmup steps
    gradient_accumulation_steps=5,         # gradient accumulation梯度累积步数
    fp16=True,                   # Mixed Precision Training
)

# Initializing trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # to use specified dataset
    eval_dataset=eval_dataset,  # to designate evaluation dataset if needed
    compute_metrics=evaluate_perplexity,
)

trainer.train()



dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


TrainOutput(global_step=68, training_loss=0.7464780246510225, metrics={'train_runtime': 106.2547, 'train_samples_per_second': 26.352, 'train_steps_per_second': 0.64, 'total_flos': 628730370195456.0, 'train_loss': 0.7464780246510225, 'epoch': 3.86})

In [24]:
torch.cuda.empty_cache()
trainer.evaluate()

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.92 GiB. GPU 0 has a total capacity of 14.75 GiB of which 1.56 GiB is free. Process 97143 has 13.19 GiB memory in use. Of the allocated memory 10.87 GiB is allocated by PyTorch, and 2.18 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Save

In [None]:
# Set drive location as /mydrive
!ln -s /content/drive/MyDrive/ /mydrive
# See inside of /mdrive folder
!ls /mydrive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# save fine-tuned model
model.save_pretrained('./drive/MyDrive')

# save tokenizer
tokenizer.save_pretrained('./drive/MyDrive')

## Use with GUI windows

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

In [None]:
# load our model and tokenizer
model = GPT2LMHeadModel.from_pretrained('./drive/MyDrive')
tokenizer = GPT2Tokenizer.from_pretrained('./drive/MyDrive')

# create pipeline with our model and tokenizer
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# test the function
print(generator("a kind of saulty cocktail", max_length=100))

### Create GUI windows(don't run in colab due to lack of environment)

In [None]:
import tkinter as tk

def commit_requirements():
  commands = entry.get()
  recommendation = generator(commands, max_length=200)
  label2.config(text=f"Here is the recommandation:\n{recommendation}")

# create the main window
root = tk.Tk()
root.title("Cocktail Asistant")
# label1
label1 = tk.Label(root, text="Welcome to Cocktail Asistant! Please write your command:")
label1.pack(pady=10)
# entry widget
entry = tk.Entry(root)
entry.pack(pady=10)
# button
button = tk.Button(root, text="commit", command=commit_requirements)
button.pack(pady=10)
# label2
label2 = tk.Label(root, text="I'll offer a recipe for you")
label2.pack(pady=10)
# start the event loop
root.mainloop()