# Load Dataset

In [10]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_csv('/content/unique_prompts_generated_recipes_v2.csv')


In [11]:
data

Unnamed: 0,Prompt,Generated Recipe
0,Generate a dairy-free recipe for lunch with yo...,"Dish: Ingredients: yogurt, chickpeas, spinach,..."
1,Generate a dairy-free recipe for dinner with g...,"Dish: Ingredients: ginger, olive oil, tomato, ..."
2,Generate a vegetarian recipe for dinner with c...,"Dish: Ingredients: cucumber, potato, tofu, bre..."
3,Generate a dairy-free recipe for lunch with le...,"Dish: Ingredients: lentils, basil, spinach, on..."
4,Generate a vegetarian recipe for dinner with b...,"Dish: Ingredients: basil, lemongrass, pasta, b..."
...,...,...
2995,Generate a vegetarian recipe for dinner with t...,"Dish: Ingredients: tomato, soy sauce, spinach,..."
2996,Generate a dairy-free recipe for dinner with e...,"Dish: Ingredients: eggplant, rice, avocado, ol..."
2997,Generate a dairy-free recipe for dinner with p...,"Dish: Ingredients: potato, olive oil, lemongra..."
2998,Generate a dairy-free recipe for dinner with t...,"Dish: Ingredients: tomato, eggplant, pasta, se..."


# Data Cleaning

In [12]:
# Clean the text: Remove special characters and lowercasing
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove non-alphanumeric characters
    return text

data['cleaned_prompt'] = data['Prompt'].apply(clean_text)
data['cleaned_recipe'] = data['Generated Recipe'].apply(clean_text)

# Split the data into train and test sets (80% train, 20% test)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


data

Unnamed: 0,Prompt,Generated Recipe,cleaned_prompt,cleaned_recipe
0,Generate a dairy-free recipe for lunch with yo...,"Dish: Ingredients: yogurt, chickpeas, spinach,...",generate a dairyfree recipe for lunch with yog...,dish ingredients yogurt chickpeas spinach carr...
1,Generate a dairy-free recipe for dinner with g...,"Dish: Ingredients: ginger, olive oil, tomato, ...",generate a dairyfree recipe for dinner with gi...,dish ingredients ginger olive oil tomato spina...
2,Generate a vegetarian recipe for dinner with c...,"Dish: Ingredients: cucumber, potato, tofu, bre...",generate a vegetarian recipe for dinner with c...,dish ingredients cucumber potato tofu breadcru...
3,Generate a dairy-free recipe for lunch with le...,"Dish: Ingredients: lentils, basil, spinach, on...",generate a dairyfree recipe for lunch with len...,dish ingredients lentils basil spinach onion i...
4,Generate a vegetarian recipe for dinner with b...,"Dish: Ingredients: basil, lemongrass, pasta, b...",generate a vegetarian recipe for dinner with b...,dish ingredients basil lemongrass pasta breadc...
...,...,...,...,...
2995,Generate a vegetarian recipe for dinner with t...,"Dish: Ingredients: tomato, soy sauce, spinach,...",generate a vegetarian recipe for dinner with t...,dish ingredients tomato soy sauce spinach chic...
2996,Generate a dairy-free recipe for dinner with e...,"Dish: Ingredients: eggplant, rice, avocado, ol...",generate a dairyfree recipe for dinner with eg...,dish ingredients eggplant rice avocado olive o...
2997,Generate a dairy-free recipe for dinner with p...,"Dish: Ingredients: potato, olive oil, lemongra...",generate a dairyfree recipe for dinner with po...,dish ingredients potato olive oil lemongrass c...
2998,Generate a dairy-free recipe for dinner with t...,"Dish: Ingredients: tomato, eggplant, pasta, se...",generate a dairyfree recipe for dinner with to...,dish ingredients tomato eggplant pasta sesame ...


# Tokenization

In [13]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [14]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset


# Initialize the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')


# Tokenize the input prompts and generated recipes
train_encodings = tokenizer(list(train_data['cleaned_prompt']), truncation=True, padding=True, max_length=128)
train_labels = tokenizer(list(train_data['cleaned_recipe']), truncation=True, padding=True, max_length=128)

test_encodings = tokenizer(list(test_data['cleaned_prompt']), truncation=True, padding=True, max_length=128)
test_labels = tokenizer(list(test_data['cleaned_recipe']), truncation=True, padding=True, max_length=128)

# Create a custom dataset for use in the Trainer
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels['input_ids']
})

test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': test_labels['input_ids']
})


In [15]:

test_dataset[0]

{'input_ids': [3806,
  3,
  9,
  13688,
  2113,
  2696,
  21,
  3074,
  28,
  12784,
  9417,
  3702,
  12909,
  24395,
  3,
  16217,
  21659,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'labels': [4419,
  3018,
  12784,
  9417,
  3702,
  12909,
  24395,
  3909,
  5148,
  12784,
  9417,
  3702,
  12909,
  617,
  24395,
  3989,
  8583,
  11,
  1716,
  1312,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

# Fine Tuning

In [17]:

# Set up training arguments with validation logging
training_args = TrainingArguments(
    output_dir='./results',  # Output directory
    num_train_epochs=5,  # Number of training epochs
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,  # Batch size for evaluation
    warmup_steps=500,  # Warmup steps
    weight_decay=0.01,  # Weight decay
    logging_dir='./logs',  # Logging directory
    logging_steps=10,
    evaluation_strategy="steps",  # Evaluate during training
    eval_steps=500,  # Evaluate every 500 steps
    save_steps=1000,  # Save model checkpoints every 1000 steps
    load_best_model_at_end=True,  # Load the best model when finished
)

# Initialize Trainer
trainer = Trainer(
    model=model,  # The model to train
    args=training_args,  # Training arguments
    train_dataset=train_dataset,  # Training dataset
    eval_dataset=test_dataset,  # Evaluation dataset
)

# Fine-tune the model
trainer.train()



Step,Training Loss,Validation Loss
500,0.0182,0.000177
1000,0.0032,3.5e-05
1500,0.0009,2.5e-05


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1500, training_loss=0.6390339790104578, metrics={'train_runtime': 149.1025, 'train_samples_per_second': 80.482, 'train_steps_per_second': 10.06, 'total_flos': 91990130688000.0, 'train_loss': 0.6390339790104578, 'epoch': 5.0})

# Save Tuned Model

In [18]:
# Save the fine-tuned model
model.save_pretrained('./fine_tuned_t5_recipe_model')
tokenizer.save_pretrained('./fine_tuned_t5_recipe_model')

('./fine_tuned_t5_recipe_model/tokenizer_config.json',
 './fine_tuned_t5_recipe_model/special_tokens_map.json',
 './fine_tuned_t5_recipe_model/spiece.model',
 './fine_tuned_t5_recipe_model/added_tokens.json')

# Recipe Generation System

In [21]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the fine-tuned model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('./fine_tuned_t5_recipe_model')
tokenizer = T5Tokenizer.from_pretrained('./fine_tuned_t5_recipe_model')

# Set the device to GPU if available, else use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the device (GPU/CPU)
model.to(device)

# Function to generate a recipe from a prompt
def generate_recipe(prompt, model, tokenizer, max_length=150):
    prompt = clean_text(prompt)  # Clean the input prompt
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Generate the recipe
    output = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, num_return_sequences=1)

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Test the recipe generation
prompt = "Generate a vegetarian recipe for dinner with tomatoes and spinach"
generated_recipe = generate_recipe(prompt, model, tokenizer)
print(generated_recipe)


dish ingredients tomatoes and spinach instructions combine tomatoes and spinach add spinach cook thoroughly and serve hot


In [22]:
# Test the recipe generation with a different prompt
prompt = "Generate a vegan dessert recipe with chocolate and almonds"
generated_recipe = generate_recipe(prompt, model, tokenizer)
print(generated_recipe)


dish ingredients chocolate and almonds instructions combine chocolate and almonds add almonds cook thoroughly and serve hot


In [23]:
# Test the recipe generation with another prompt
prompt = "Generate a gluten-free recipe for breakfast with eggs and avocado"
generated_recipe = generate_recipe(prompt, model, tokenizer)
print(generated_recipe)


dish ingredients eggs and avocado instructions combine eggs and avocado add avocado cook thoroughly and serve hot
