<a href="https://colab.research.google.com/github/Panda-22/LLM-Team2/blob/main/llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip install accelerate -U
!pip install transformers[torch]

In [None]:
from datasets import load_dataset
from transformers import GPT2Tokenizer
from torch.utils.data import Dataset
import torch

class MyDataset(Dataset):
    """Fixed-length causal-LM training samples built from recipe records.

    Every recipe is flattened into a single string
    ``"<title> Ingredients: a, b. Directions: step step. "``, terminated with
    the tokenizer's EOS token (when it has one), and tokenized to exactly
    ``block_size`` tokens (truncated or padded).

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=block_size, padding="max_length", return_tensors="pt")``.
            It must return a mapping of tensors containing ``"input_ids"``
            (and normally ``"attention_mask"``), each with a leading batch
            dimension of size 1.
        recipes: Iterable of mappings with a ``'title'`` string and
            ``'ingredients'`` / ``'directions'`` sequences of strings.
        block_size: Token length of every sample.
    """

    # Label value that the Hugging Face causal-LM loss skips
    # (the ignore_index of the underlying cross-entropy loss).
    IGNORE_INDEX = -100

    def __init__(self, tokenizer, recipes, block_size):
        self.tokenizer = tokenizer
        self.samples = []

        # Padding positions are excluded from the loss in __getitem__, so each
        # text is closed with one real EOS token: that is the only place the
        # model can learn where a recipe ends.
        eos_token = getattr(tokenizer, "eos_token", None) or ""

        for recipe in recipes:
            text = self._format_recipe(recipe) + eos_token

            # Calling the tokenizer directly pads/truncates to block_size.
            tokenized_text = tokenizer(
                text,
                truncation=True,
                max_length=block_size,
                padding="max_length",
                return_tensors="pt",
            )
            self.samples.append(tokenized_text)

    @staticmethod
    def _format_recipe(recipe):
        """Flatten one recipe record into the training text (without EOS)."""
        ingredients = ', '.join(recipe['ingredients'])
        directions = ' '.join(recipe['directions'])
        return (
            recipe['title'] + " "
            + "Ingredients: " + ingredients + ". "
            + "Directions: " + directions + ". "
        )

    def __len__(self):
        """Return the number of tokenized recipes."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the tensors for sample ``idx`` plus its ``labels``.

        ``labels`` is a copy of ``input_ids`` in which every position whose
        ``attention_mask`` is 0 (padding) is replaced by ``IGNORE_INDEX``.
        The pad token is the same id as EOS in this notebook, so padding can
        only be told apart from real tokens through the attention mask.
        """
        # Build a fresh dict so the cached encoding in self.samples is never
        # mutated by indexing; squeeze(0) drops the batch dimension of size 1.
        item = {key: value.squeeze(0) for key, value in self.samples[idx].items()}

        labels = item["input_ids"].clone()
        attention_mask = item.get("attention_mask")
        if attention_mask is not None:
            labels[attention_mask == 0] = self.IGNORE_INDEX

        item["labels"] = labels
        return item

# Fetch the training split of the cocktail-recipe dataset and materialize it
# as a plain list of per-recipe records for MyDataset to iterate over.
dataset = load_dataset('brianarbuckle/cocktail_recipes', split='train')
recipes = list(dataset)

# Tokenizer matching the gpt2-medium checkpoint. EOS is reused as the padding
# token so that the `padding="max_length"` request inside MyDataset has a pad
# token to fill with.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
tokenizer.pad_token = tokenizer.eos_token

# Tokenize every recipe into a fixed 128-token training sample.
my_dataset = MyDataset(tokenizer, recipes, block_size=128)



In [None]:
# `pipeline` is imported here because this cell builds the text-generation
# pipeline below; without it a fresh kernel fails with NameError.
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments, pipeline

# Single location for the fine-tuned weights and tokenizer files.
FINE_TUNED_DIR = './fine_tuned_model'

# Start from the pretrained gpt2-medium weights.
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')

# Training configuration.
# NOTE(review): the recorded run logged only about 800 steps, so a 500-step
# warmup spans more than half of training - confirm this is intended.
training_args = TrainingArguments(
    output_dir='./results',          # where checkpoints are written
    overwrite_output_dir=True,       # reuse output_dir even if it already has content
    num_train_epochs=4,              # passes over the training set
    per_device_train_batch_size=4,   # samples per device per step
    save_steps=1000,                 # checkpoint interval, in steps
    save_total_limit=2,              # maximum number of checkpoints kept on disk
    logging_dir='./logs',            # log directory
    logging_steps=100,               # training-loss logging interval, in steps
    prediction_loss_only=True,       # evaluation/prediction returns only the loss
    learning_rate=5e-5,              # peak learning rate
    warmup_steps=500,                # learning-rate warmup length, in steps
)

# No eval_dataset is supplied, so only the training loss is reported.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=my_dataset,
)

# Fine-tune.
trainer.train()

# Persist model and tokenizer side by side so the directory is loadable on its own.
model.save_pretrained(FINE_TUNED_DIR)
tokenizer.save_pretrained(FINE_TUNED_DIR)

# Reload from disk: this checks the saved artifacts are usable and rebinds
# `model` / `tokenizer` to the fine-tuned versions for the cells that follow.
model = GPT2LMHeadModel.from_pretrained(FINE_TUNED_DIR)
tokenizer = GPT2Tokenizer.from_pretrained(FINE_TUNED_DIR)

# Wrap the fine-tuned model in a text-generation pipeline and run a smoke test.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
print(generator("Example prompt", max_length=100))


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss
100,3.295
200,1.1177
300,0.9673
400,0.9159
500,0.7654
600,0.7557
700,0.6545
800,0.5707


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Example prompt: When I am not at work, do you think of the night before dinner or the first night I was in this house? It should be an optional choice, but sometimes this simple thing—before dinner—pays huge. Even though many are not traditional dinner-time pousse-café drinks, combining them with a good old-fashioned iced coffee can be the key to a complete night's rest., 2 ounces espresso, 1/2 ounce peach schnapps,"}]


In [None]:
# Both names were used here without ever being imported, which raises
# NameError on a fresh kernel.
from transformers import pipeline, set_seed

# Seed the RNGs so any sampling during generation is repeatable.
set_seed(41)

# Build the pipeline once, from the fine-tuned model and tokenizer already in
# memory. The earlier extra load from './fine_tuned_model' was overwritten
# before it was ever used, so it is dropped.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

print(generator("whiskey", max_length=100))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'whiskey spritz with amaretto. (iStock) A cocktail from London, named for its famous apothecary, from the 14th century, and originally made frothy with absinthe.. Directions: Shake ingredients with ice. Strain into a chilled cocktail glass.. '}]


In [None]:
# Both names were used here without ever being imported, which raises
# NameError on a fresh kernel.
from transformers import pipeline, set_seed

# Seed the RNGs so any sampling during generation is repeatable.
set_seed(41)

# Build the pipeline once, from the fine-tuned model and tokenizer already in
# memory. The earlier extra load from './fine_tuned_model' was overwritten
# before it was ever used, so it is dropped.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

print(generator("Final Ward", max_length=100))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Final Ward Ingredients: 1 Part Ginger Liqueur, 3 Parts Pineapple Juice, 2 Parts GrapeFruit Juice. Directions: shake on ice and strain Garnish with twist of Lemon peel. '}]
