<a href="https://colab.research.google.com/github/Panda-22/LLM-Team2/blob/main/GPT2_fine_tuning_cocktail.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tune GPT2 with Eval_dataset

## Load Dataset

In [1]:
!pip install datasets
!pip install accelerate -U
!pip install transformers[torch]
!pip install evaluate
!pip install rouge-score

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2

## Load gpt2 Model and Tokenizer

In [20]:
from transformers import GPT2LMHeadModel

# loading pretrained model
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')

In [21]:
from transformers import GPT2Tokenizer

# Initialize tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
tokenizer.pad_token = tokenizer.eos_token

## Preprocess the Dataset

In [2]:
from datasets import load_dataset

# Load dataset
dataset = load_dataset('brianarbuckle/cocktail_recipes', split='train')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/3.33k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/96.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/875 [00:00<?, ? examples/s]

In [4]:
import torch
from torch.utils.data import Dataset

class MyDataset(Dataset):

    def __init__(self, tokenizer, recipes, block_size):
        self.tokenizer = tokenizer
        self.samples = []

        for recipe in recipes:
            text = recipe['title'] + " "  # Start with the title
            text += "Ingredients: " + ', '.join(recipe['ingredients']) + ". "  # Add ingredients
            text += "Directions: " + ' '.join(recipe['directions']) + ". "  # Add directions

            # call __call__ from tokenizer for automatic padding
            tokenized_text = tokenizer(text, truncation=True, max_length=block_size, padding="max_length", return_tensors="pt")

            # adding tokenized_text to samples
            self.samples.append(tokenized_text)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        # return a dict: input_ids, attention_mask and labels
        sample = self.samples[idx]
        # for language models, labels equal to input_ids in general
        sample["labels"] = sample["input_ids"].clone()
        return {key: value.squeeze(0) for key, value in sample.items()}

# Prepare the list of recipe texts
# 'dataset' yields dictionaries directly
recipes = [example for example in dataset]

# recreate dataset
my_dataset = MyDataset(tokenizer, recipes, block_size=128)

## Split the Dataset

In [5]:
from sklearn.model_selection import train_test_split
# divide the dataset into training and testing parts
train_dataset, eval_dataset = train_test_split(my_dataset, test_size=0.1, random_state=42)

## Define Evaluation Metrics

In [25]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import numpy as np
#import evaluate

# Define perplexity
def evaluate_perplexity(model, tokenizer, dataloader):
    model.eval()
    total_loss = 0
    for batch in dataloader:
        inputs = batch['input_ids']
        labels = batch['labels']
        attention_mask = batch['attention_mask']

        with torch.no_grad():
            outputs = model(inputs, labels=labels, attention_mask=attention_mask)
            loss = outputs.loss
            total_loss += loss.item()

    average_loss = total_loss / len(dataloader)
    perplexity = torch.exp(torch.tensor(average_loss))
    return perplexity.item()

# Define BLEU
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
def evaluate_bleu(model, tokenizer, dataloader):
    model.eval()
    smooth = SmoothingFunction().method7  # Choose the desired smoothing function
    for batch in dataloader:
        inputs = batch['input_ids']
        labels = batch['labels']
        attention_mask = batch['attention_mask']

        with torch.no_grad():
            outputs = model(inputs, labels=labels, attention_mask=attention_mask)
            for ref_text, pred_text in zip(labels, outputs):
              results = sentence_bleu(str(ref_text), str(pred_text), smoothing_function=smooth)

    print("Smoothed BLEU Score:", results)

# Define ROUGE
from rouge_score import rouge_scorer

def evaluate_rouge(model, tokenizer, dataloader):
    model.eval()
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    rouge_scores = {'rouge1': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                    'rouge2': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                    'rougeL': {'f': 0.0, 'p': 0.0, 'r': 0.0}}

    total_samples = 0

    for batch in dataloader:
        inputs = batch['input_ids']
        labels = batch['labels']
        attention_mask = batch['attention_mask']

        # Generate model outputs
        with torch.no_grad():
            outputs = model(inputs, attention_mask=attention_mask).logits

        # Decode model outputs and labels
        decoded_outputs = tokenizer.decode(outputs.argmax(dim=-1), skip_special_tokens=True)
        decoded_labels = tokenizer.decode(labels, skip_special_tokens=True)

        # Calculate ROUGE scores for each pair of reference and predicted texts
        for ref_text, pred_text in zip(decoded_labels, decoded_outputs):
            scores = scorer.score(ref_text, pred_text)
            for metric, score in scores.items():
                rouge_scores[metric]['f'] += score.fmeasure
                rouge_scores[metric]['p'] += score.precision
                rouge_scores[metric]['r'] += score.recall
            total_samples += 1

    # Average ROUGE scores across all samples
    for metric in rouge_scores.keys():
        for key in ['f', 'p', 'r']:
            rouge_scores[metric][key] /= total_samples

    return rouge_scores


In [None]:
perplexity = evaluate_perplexity(model, tokenizer, eval_dataset)
print(f"Perplexity before training: {perplexity}")

Perplexity before training: 1461.4776611328125


In [26]:
print("ROUGE before training:")
evaluate_rouge(model, tokenizer, eval_dataset)

ROUGE before training:


{'rouge1': {'f': 0.04300774325963243,
  'p': 0.04300774325963243,
  'r': 0.04300774325963243},
 'rouge2': {'f': 0.0, 'p': 0.0, 'r': 0.0},
 'rougeL': {'f': 0.04300774325963243,
  'p': 0.04300774325963243,
  'r': 0.04300774325963243}}

In [22]:
print("BLEU before training:")
evaluate_bleu(model, tokenizer, eval_dataset)

BLEU before training:
Smoothed BLEU Score: 0.10199713343522002


## Define compute_metrics

## Train and Evaluate

In [None]:
from transformers import Trainer, TrainingArguments

# Defining training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    overwrite_output_dir=True,       # overwrite to output directory
    num_train_epochs= 4,              # number of training epochs
    per_device_train_batch_size = 3,   # batch size
    save_steps=1000,                 # save model per steps
    save_total_limit=2,              # total number of saved models
    logging_dir='./logs',            # log directory
    logging_steps=100,               # save log per steps
    #prediction_loss_only=True,      # predict the loss only, default as False
    learning_rate=5e-3,              # learning rate
    warmup_steps=500,                # warmup steps
)

# Initializing trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # to use specified dataset
    eval_dataset=eval_dataset,  # to designate evaluation dataset if needed
    #compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
100,1.5954
200,1.534
300,1.3813
400,1.3365
500,1.3467
600,1.1775
700,1.173
800,1.0965
900,0.7926
1000,0.7272


TrainOutput(global_step=1052, training_loss=1.188945436658968, metrics={'train_runtime': 477.7991, 'train_samples_per_second': 6.589, 'train_steps_per_second': 2.202, 'total_flos': 730887446593536.0, 'train_loss': 1.188945436658968, 'epoch': 4.0})

In [None]:
import torch
torch.cuda.empty_cache()
trainer.evaluate()

{'eval_loss': 1.1301360130310059,
 'eval_runtime': 2.7244,
 'eval_samples_per_second': 32.301,
 'eval_steps_per_second': 4.038,
 'epoch': 4.0}

## Save

In [15]:
# Set drive location as /mydrive
!ln -s /content/drive/MyDrive/ /mydrive
# See inside of /mdrive folder
!ls /mydrive
from google.colab import drive
drive.mount('/content/drive')

/mydrive
Mounted at /content/drive


In [None]:
# Save model and tokenizer
model.save_pretrained('./drive/MyDrive')
tokenizer.save_pretrained('./drive/MyDrive')

('./drive/MyDrive/tokenizer_config.json',
 './drive/MyDrive/special_tokens_map.json',
 './drive/MyDrive/vocab.json',
 './drive/MyDrive/merges.txt',
 './drive/MyDrive/added_tokens.json')

## Call model for evaluation

In [27]:
# Perplexity evaluation
model = GPT2LMHeadModel.from_pretrained('./drive/MyDrive')
tokenizer = GPT2Tokenizer.from_pretrained('./drive/MyDrive')
perplexity = evaluate_perplexity(model, tokenizer, eval_dataset)
print(f"Perplexity after training: {perplexity}")

print("BLEU after training:")
evaluate_bleu(model, tokenizer, eval_dataset)

Perplexity after training: 3.0960609912872314
ROUGE after training:
BLEU after training:
Smoothed BLEU Score: 0.10199713343522002


In [28]:
print("ROUGE after training:")
evaluate_rouge(model, tokenizer, eval_dataset)

ROUGE after training:


{'rouge1': {'f': 0.05250596658711217,
  'p': 0.05250596658711217,
  'r': 0.05250596658711217},
 'rouge2': {'f': 0.0, 'p': 0.0, 'r': 0.0},
 'rougeL': {'f': 0.05250596658711217,
  'p': 0.05250596658711217,
  'r': 0.05250596658711217}}

## Use with GUI windows

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

In [None]:
# 加载微调后的模型和分词器
model = GPT2LMHeadModel.from_pretrained('./drive/MyDrive')
tokenizer = GPT2Tokenizer.from_pretrained('./drive/MyDrive')

# 创建文本生成pipeline，显式地指定模型和分词器
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# test the function
print(generator("a kind of saulty cocktail", max_length=100))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'a kind of saulty cocktail that calls for 1/4 cup Brandy, 0.25 ounce dry vermouth, 5 to 6 fresh cherries, 2 dashes Angostura bitters, Garnish with pineapple and an orange slice, and maraschino cherry, Lemon peel spiral. '}]


### Create GUI windows(don't run in colab due to lack of environment)

In [None]:
import tkinter as tk

def commit_requirements():
  commands = entry.get()
  recommendation = generator(command, max_length=200)
  label2.config(text=f"Here is the recommandation:\n{recipe_contexts}")

# create the main window
root = tk.Tk()
root.title("Cocktail Asistant")
# label1
label1 = tk.Label(root, text="Welcome to Cocktail Asistant! Please write your command:")
label1.pack(pady=10)
# entry widget
entry = tk.Entry(root)
entry.pack(pady=10)
# button
button = tk.Button(root, text="commit", command=commit_requirements)
button.pack(pady=10)
# label2
label2 = tk.Label(root, text="I'll offer a recipe for you")
label2.pack(pady=10)
# start the event loop
root.mainloop()

In [None]:
# Set drive location as /mydrive
!ln -s /content/drive/MyDrive/ /mydrive
# See inside of /mdrive folder
!ls /mydrive
from google.colab import drive
drive.mount('/content/drive')


/mydrive
Mounted at /content/drive


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
model = GPT2LMHeadModel.from_pretrained('./drive/MyDrive')
tokenizer = GPT2Tokenizer.from_pretrained('./drive/MyDrive')
model.save_pretrained('./drive/MyDrive')
tokenizer.save_pretrained('./drive/MyDrive')