<a href="https://colab.research.google.com/github/Panda-22/LLM-Team2/blob/main/llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets
!pip install accelerate -U
!pip install transformers[torch]



In [2]:
from datasets import load_dataset
from transformers import GPT2Tokenizer
from torch.utils.data import Dataset
import torch

class MyDataset(Dataset):
    def __init__(self, tokenizer, recipes, block_size):
        self.tokenizer = tokenizer
        self.samples = []

        for recipe in recipes:
            text = recipe['title'] + " "  # Start with the title
            text += "Ingredients: " + ', '.join(recipe['ingredients']) + ". "  # Add ingredients
            text += "Directions: " + ' '.join(recipe['directions']) + ". "  # Add directions

            # 使用tokenizer的__call__方法，这会自动处理padding等
            tokenized_text = tokenizer(text, truncation=True, max_length=block_size, padding="max_length", return_tensors="pt")

            # 将tokenized_text添加到samples
            self.samples.append(tokenized_text)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        # 此处直接返回字典，包含input_ids和attention_mask，以及labels
        sample = self.samples[idx]
        # 对于语言模型，labels通常与input_ids相同
        sample["labels"] = sample["input_ids"].clone()
        return {key: value.squeeze(0) for key, value in sample.items()}

# Initialize tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

# Load dataset
# Assuming `dataset` is a list of dictionaries with the given structure
# This part depends on how your data is actually loaded
dataset = load_dataset('brianarbuckle/cocktail_recipes', split='train')

# Prepare the list of recipe texts
# Assuming `dataset` yields dictionaries directly
recipes = [example for example in dataset]

# 将分词器的pad_token设置为eos_token
tokenizer.pad_token = tokenizer.eos_token

# 现在再次创建数据集实例
my_dataset = MyDataset(tokenizer, recipes, block_size=128)


In [None]:
from transformers import GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

# loading pretrained model
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')

# Defining training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    overwrite_output_dir=True,       # overwrite to output directory
    num_train_epochs=4,              # number of training epochs
    per_device_train_batch_size=4,   # batch size
    save_steps=1000,                 # save model per steps
    save_total_limit=2,              # total number of saved models
    logging_dir='./logs',            # log directory
    logging_steps=100,               # save log per steps
    prediction_loss_only=True,       # predict the loss only
    learning_rate=5e-5,              # learning rate
    warmup_steps=500,                # warmup steps
)

# Initializing trainer
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = my_dataset,  # to use specified dataset
    # eval_dataset=eval_dataset,  # to designate evaluation dataset if needed
)

# start training
trainer.train()

# to save the fine-tuned model
model.save_pretrained('./fine_tuned_model')

# to save the tokenizer to the same folder
tokenizer.save_pretrained('./fine_tuned_model')

# loading fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained('./fine_tuned_model')
tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_model')

# generation pipeline

INPUT_TXT = 'text-generation'
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# to generate text
print(generator("Example prompt", max_length=100))


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
100,3.2299
200,1.0615
300,0.9999
400,0.8817


Step,Training Loss
100,3.2299
200,1.0615
300,0.9999
400,0.8817


In [None]:
generator = pipeline('text-generation', model='./fine_tuned_model')
set_seed(41)

# 创建文本生成pipeline，显式地指定模型和分词器
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# 生成文本
print(generator("whiskey", max_length=100))

In [None]:
generator = pipeline('text-generation', model='./fine_tuned_model')
set_seed(41)

# 创建文本生成pipeline，显式地指定模型和分词器
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# 生成文本
print(generator("Final Ward", max_length=100))
