<a href="https://colab.research.google.com/github/Panda-22/LLM-Team2/blob/main/GPT2_fine_tuning_cocktail.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tune GPT2 with Eval_dataset

## Load Dataset

In [1]:
!pip install datasets
!pip install accelerate -U
!pip install transformers[torch]
!pip install evaluate
!pip install rouge-score

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-

In [2]:
from datasets import load_dataset

# Load dataset
dataset = load_dataset('brianarbuckle/cocktail_recipes', split='train')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/3.33k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/96.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/875 [00:00<?, ? examples/s]

## Load Tokenizer

In [3]:
from transformers import GPT2Tokenizer

# Initialize tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

## Preprocess the Dataset

In [4]:
import torch
from torch.utils.data import Dataset

class MyDataset(Dataset):

    def __init__(self, tokenizer, recipes, block_size):
        self.tokenizer = tokenizer
        self.samples = []

        for recipe in recipes:
            text = recipe['title'] + " "  # Start with the title
            text += "Ingredients: " + ', '.join(recipe['ingredients']) + ". "  # Add ingredients
            text += "Directions: " + ' '.join(recipe['directions']) + ". "  # Add directions

            # call __call__ from tokenizer for automatic padding
            tokenized_text = tokenizer(text, truncation=True, max_length=block_size, padding="max_length", return_tensors="pt")

            # adding tokenized_text to samples
            self.samples.append(tokenized_text)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        # return a dict: input_ids, attention_mask and labels
        sample = self.samples[idx]
        # for language models, labels equal to input_ids in general
        sample["labels"] = sample["input_ids"].clone()
        return {key: value.squeeze(0) for key, value in sample.items()}

# Prepare the list of recipe texts
# 'dataset' yields dictionaries directly
recipes = [example for example in dataset]

# recreate dataset
my_dataset = MyDataset(tokenizer, recipes, block_size=128)

## Split the Dataset

In [5]:
from sklearn.model_selection import train_test_split
# divide the dataset into training and testing parts
train_dataset, eval_dataset = train_test_split(my_dataset, test_size=0.1, random_state=42)

## Load Model (GPT2-meduim)

In [6]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
#import evaluate

# Define perplexity
def evaluate_perplexity(model, tokenizer, dataloader):
    model.eval()
    total_loss = 0
    for batch in dataloader:
        inputs = batch['input_ids']
        labels = batch['labels']
        attention_mask = batch['attention_mask']

        with torch.no_grad():
            outputs = model(inputs, labels=labels, attention_mask=attention_mask)
            loss = outputs.loss
            total_loss += loss.item()

    average_loss = total_loss / len(dataloader)
    perplexity = torch.exp(torch.tensor(average_loss))
    return perplexity.item()

# Define BLEU
from nltk.translate.bleu_score import sentence_bleu
def evaluate_bleu(model, tokenizer, dataloader):
    model.eval()
    for batch in dataloader:
        inputs = batch['input_ids']
        labels = batch['labels']
        attention_mask = batch['attention_mask']

        outputs = model(inputs, labels = labels, attention_mask = attention_mask)
        #bleu = evaluate.load("bleu")
        #results = bleu.compute(predictions = outputs, references = labels)
        results = sentence_bleu(outputs, labels)
    print(results, labels, outputs)

# Define ROUGE
from rouge_score import rouge_scorer
def evaluate_rouge(model, tokenizer, dataloader):
    model.eval()
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    for batch in dataloader:
        inputs = batch['input_ids']
        labels = batch['labels']
        attention_mask = batch['attention_mask']

        # Generate model outputs
        with torch.no_grad():
            outputs = model(inputs, attention_mask=attention_mask)

        # Convert output tensor to text
        predicted_texts = tokenizer.batch_decode(outputs.logits, skip_special_tokens=True)
        reference_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Calculate ROUGE scores
        for ref_text, pred_text in zip(reference_texts, predicted_texts):
            scores = scorer.score(ref_text, pred_text)
            print(f"ROUGE scores for example:")
            print(f"ROUGE-1: {scores['rouge1'].fmeasure:.4f}")
            print(f"ROUGE-2: {scores['rouge2'].fmeasure:.4f}")
            print(f"ROUGE-L: {scores['rougeL'].fmeasure:.4f}")
            print("-" * 30)
    return scores


In [8]:
from transformers import GPT2LMHeadModel

# loading pretrained model
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [10]:
perplexity = evaluate_perplexity(model, tokenizer, eval_dataset)
print(f"Perplexity before training: {perplexity}")

Perplexity before training: 1461.490234375


## Define compute_metrics

## Train and Evaluate

In [11]:
from transformers import Trainer, TrainingArguments

# Defining training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    overwrite_output_dir=True,       # overwrite to output directory
    num_train_epochs= 4,              # number of training epochs
    per_device_train_batch_size = 3,   # batch size
    save_steps=1000,                 # save model per steps
    save_total_limit=2,              # total number of saved models
    logging_dir='./logs',            # log directory
    logging_steps=100,               # save log per steps
    #prediction_loss_only=True,      # predict the loss only, default as False
    learning_rate=5e-3,              # learning rate
    warmup_steps=500,                # warmup steps
)

# Initializing trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # to use specified dataset
    eval_dataset=eval_dataset,  # to designate evaluation dataset if needed
    #compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
100,1.5954
200,1.534
300,1.3813
400,1.3365
500,1.3467
600,1.1775
700,1.173
800,1.0965
900,0.7926
1000,0.7272


TrainOutput(global_step=1052, training_loss=1.188945436658968, metrics={'train_runtime': 477.7991, 'train_samples_per_second': 6.589, 'train_steps_per_second': 2.202, 'total_flos': 730887446593536.0, 'train_loss': 1.188945436658968, 'epoch': 4.0})

In [12]:
import torch
torch.cuda.empty_cache()
trainer.evaluate()

{'eval_loss': 1.1301360130310059,
 'eval_runtime': 2.7244,
 'eval_samples_per_second': 32.301,
 'eval_steps_per_second': 4.038,
 'epoch': 4.0}

## Save

In [13]:
# Set drive location as /mydrive
!ln -s /content/drive/MyDrive/ /mydrive
# See inside of /mdrive folder
!ls /mydrive
from google.colab import drive
drive.mount('/content/drive')

/mydrive
Mounted at /content/drive


In [14]:
# Save model and tokenizer
model.save_pretrained('./drive/MyDrive')
tokenizer.save_pretrained('./drive/MyDrive')

('./drive/MyDrive/tokenizer_config.json',
 './drive/MyDrive/special_tokens_map.json',
 './drive/MyDrive/vocab.json',
 './drive/MyDrive/merges.txt',
 './drive/MyDrive/added_tokens.json')

## Call model for evaluation

In [15]:
# Perplexity evaluation
model = GPT2LMHeadModel.from_pretrained('./drive/MyDrive')
tokenizer = GPT2Tokenizer.from_pretrained('./drive/MyDrive')
perplexity = evaluate_perplexity(model, tokenizer, eval_dataset)
print(f"Perplexity after training: {perplexity}")

Perplexity after training: 3.096069812774658


In [None]:
# ROUGE evaluation
from transformers import GPT2LMHeadModel, GPT2Tokenizer
model = GPT2LMHeadModel.from_pretrained('./drive/MyDrive')
tokenizer = GPT2Tokenizer.from_pretrained('./drive/MyDrive')

rouge = evaluate_rouge(model, tokenizer, eval_dataset)
print(f"ROUGE after training: {rouge}")

In [None]:
# BLEU evaluation
model = GPT2LMHeadModel.from_pretrained('./drive/MyDrive')
tokenizer = GPT2Tokenizer.from_pretrained('./drive/MyDrive')

bleu = evaluate_bleu(model, tokenizer, eval_dataset)
print(f"BLEU after training: {bleu}")

## Use with GUI windows

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

In [None]:
# 加载微调后的模型和分词器
model = GPT2LMHeadModel.from_pretrained('./drive/MyDrive')
tokenizer = GPT2Tokenizer.from_pretrained('./drive/MyDrive')

# 创建文本生成pipeline，显式地指定模型和分词器
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# test the function
print(generator("a kind of saulty cocktail", max_length=100))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'a kind of saulty cocktail that calls for 1/4 cup Brandy, 0.25 ounce dry vermouth, 5 to 6 fresh cherries, 2 dashes Angostura bitters, Garnish with pineapple and an orange slice, and maraschino cherry, Lemon peel spiral. '}]


### Create GUI windows(don't run in colab due to lack of environment)

In [None]:
import tkinter as tk

def commit_requirements():
  commands = entry.get()
  recommendation = generator(command, max_length=200)
  label2.config(text=f"Here is the recommandation:\n{recipe_contexts}")

# create the main window
root = tk.Tk()
root.title("Cocktail Asistant")
# label1
label1 = tk.Label(root, text="Welcome to Cocktail Asistant! Please write your command:")
label1.pack(pady=10)
# entry widget
entry = tk.Entry(root)
entry.pack(pady=10)
# button
button = tk.Button(root, text="commit", command=commit_requirements)
button.pack(pady=10)
# label2
label2 = tk.Label(root, text="I'll offer a recipe for you")
label2.pack(pady=10)
# start the event loop
root.mainloop()