In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [2]:
# Read data
import pickle

ingr_map = pd.read_pickle("/kaggle/input/food-com-recipes-and-user-interactions/ingr_map.pkl")
input_path = "/kaggle/input/food-com-recipes-and-user-interactions/"
df_recipes = pd.read_csv(input_path + "RAW_recipes.csv")

In [3]:
import ast

replacement_dict = pd.Series(ingr_map['replaced'].values, index=ingr_map['raw_ingr']).to_dict()

# Function to convert and replace ingredients
def convert_and_replace(ingredient_str, replacement_dict):
    try:
        # Convert string representation of list to actual list
        ingredient_list = ast.literal_eval(ingredient_str)
        # Replace ingredients
        replaced_list = [replacement_dict.get(ingredient, ingredient) for ingredient in ingredient_list]
        return replaced_list
    except (ValueError, SyntaxError):
        # Handle cases where conversion fails
        return []

# Apply replacement to the ingredients column
df_recipes['ingredients'] = df_recipes['ingredients'].apply(lambda x: convert_and_replace(x, replacement_dict))

In [4]:
df_recipes = df_recipes.iloc[115818:173727]

# Combine the relevant columns into a new dataframe
df_simplified = df_recipes[['minutes', 'ingredients', 'steps']].copy()

# Format the ingredients as a comma-separated string for better readability
df_simplified['ingredients'] = df_simplified['ingredients'].apply(lambda x: ', '.join(x))

# Define a separator token for individual ingredients
ingredient_separator = " <sep> "

# Format each recipe entry with start and end tokens for each section
def format_recipe_with_tokens(row):
    # Convert ingredients list into a string with <sep> token between each ingredient
    ingredients = ingredient_separator.join(row['ingredients'].split(', '))
    
    # Format the recipe with start and end tokens
    return (
        f"<start-time> {row['minutes']} minutes <end-time> "
        f"<start-ingredients> {ingredients} <end-ingredients> "
        f"<start-steps> {row['steps']} <end-steps>"
    )

# Apply the formatting function to each row
df_simplified['formatted_recipe_with_tokens'] = df_simplified.apply(format_recipe_with_tokens, axis=1)

In [5]:
df_simplified.iloc[0]['formatted_recipe_with_tokens']

"<start-time> 5 minutes <end-time> <start-ingredients> milk <sep> instant breakfast drink mix <sep> banana <sep> chocolate syrup <sep> peanut butter <sep> honey <sep> vanilla <sep> egg <sep> ice cube <end-ingredients> <start-steps> ['slice banana and put all ingredients into the blender', 'blend until smooth'] <end-steps>"

In [6]:
recipes = df_simplified['formatted_recipe_with_tokens'].tolist()

from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("/kaggle/input/gpt-try-3/trained_gpt2_model")

# Define special tokens, including separate pad and eos tokens
special_tokens = {
    'eos_token': '<eos>',
    'unk_token': '<unk>',
    'pad_token': '<pad>',
    'additional_special_tokens': [
        '<start-time>', '<end-time>', 
        '<start-ingredients>', '<end-ingredients>', 
        '<start-steps>', '<end-steps>', '<sep>'
    ]
}

# Add the special tokens to the tokenizer vocabulary
tokenizer.add_special_tokens(special_tokens)

# Verify tokens
print("Special Tokens:", tokenizer.special_tokens_map)

Special Tokens: {'bos_token': '<|endoftext|>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<start-time>', '<end-time>', '<start-ingredients>', '<end-ingredients>', '<start-steps>', '<end-steps>', '<sep>']}


In [7]:
model = GPT2LMHeadModel.from_pretrained("/kaggle/input/gpt-try-3/trained_gpt2_model")
model.resize_token_embeddings(len(tokenizer))

Embedding(50267, 768)

In [8]:
import torch
from torch.utils.data import Dataset, DataLoader

class RecipeDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Extract input IDs and attention mask, ensure labels align with input_ids
        input_ids = inputs["input_ids"].squeeze()
        attention_mask = inputs["attention_mask"].squeeze()
        
        # Set labels to input_ids, masking out pad tokens
        labels = input_ids.clone()
        labels[labels == tokenizer.pad_token_id] = -100  # Ignore padding tokens in loss

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

# Create dataset and dataloader
max_length = 512
dataset = RecipeDataset(df_simplified['formatted_recipe_with_tokens'].tolist(), tokenizer, max_length=max_length)
train_loader = DataLoader(dataset, batch_size=2, shuffle=True)

In [9]:
# Move model to device (GPU if available)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
num_epochs = 5  # Adjust as needed
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")

print("Training completed!")

Epoch 1/5, Loss: 1.3142313957214355
Epoch 2/5, Loss: 1.411244511604309
Epoch 3/5, Loss: 1.2312748432159424
Epoch 4/5, Loss: 1.0986993312835693
Epoch 5/5, Loss: 1.5003923177719116
Training completed!


In [10]:
model.save_pretrained("trained_gpt2_model")
tokenizer.save_pretrained("trained_gpt2_model")

('trained_gpt2_model/tokenizer_config.json',
 'trained_gpt2_model/special_tokens_map.json',
 'trained_gpt2_model/vocab.json',
 'trained_gpt2_model/merges.txt',
 'trained_gpt2_model/added_tokens.json')

In [11]:
model = GPT2LMHeadModel.from_pretrained("trained_gpt2_model").to(device)
tokenizer = GPT2Tokenizer.from_pretrained("trained_gpt2_model")

# Define your input
input_text = (
    "<start-time> 30 minutes <end-time> "
    "<start-ingredients> carrot <sep> beans <sep> rice <sep> corn <sep> garlic <end-ingredients> "
    "<start-steps>"
)

# Encode input and generate output
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

# Generate steps
outputs = model.generate(input_ids, max_length=512, top_k=10, top_p=0.9, do_sample=True)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

print("Generated Recipe Steps:", generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated Recipe Steps: <start-time>  30 minutes  <end-time>   <start-ingredients>  carrot  <sep>  beans  <sep>  rice  <sep>  corn  <sep>  garlic  <end-ingredients>   <start-steps>  ['put rice in a large pot and add 2 cups of water', 'bring to a boil', 'when boiling add carrots, beans, and rice', 'cook on low until all ingredients are tender', 'add corn and garlic and mix well']  <end-steps> id: if you like, add more rice to make 1 cup']  <end-steps> id: add more rice to make 1 cup', 'also, i also added chopped onions to make 1 cup of onions']  <end-steps> id: add more rice to make 1 cup of rice', 'also, i added some chopped carrots for flavor']  <end-steps> id: add some chopped chicken or pork, some chopped tomatoes, some sliced green peppers for flavor', 'also, you can add a can of chopped chicken', 'also, you can add some chopped green olives for flavor']  <end-steps> id: just add some fresh parsley to make a garnish']  <end-steps> id: you can add some sliced cucumbers or diced toma

In [12]:
# recipes = df_simplified['formatted_recipe_with_tokens'].tolist()

# from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
# import torch

# # Load the pre-trained GPT-2 model and tokenizer
# model_name = "gpt2"
# tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# model = GPT2LMHeadModel.from_pretrained(model_name)

# # Set eos_token as pad_token
# tokenizer.pad_token = tokenizer.eos_token

# # Tokenize the dataset
# inputs = tokenizer(recipes, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
# dataset = torch.utils.data.TensorDataset(inputs["input_ids"], inputs["attention_mask"])

In [13]:
# from torch.utils.data import Dataset
# from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# class RecipeDataset(Dataset):
#     def __init__(self, encodings):
#         self.encodings = encodings

#     def __getitem__(self, idx):
#         item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
#         return item

#     def __len__(self):
#         return len(self.encodings['input_ids'])

# # Create the dataset
# dataset = RecipeDataset(inputs)

In [14]:
# # Define data collator
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# import wandb

# wandb.login(key='c65824cada15b84e31e7e9f35db6fe50eecfd857')

# # Define training arguments
# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=3,
#     per_device_train_batch_size=2,  # Adjust based on your GPU's capacity
#     save_steps=1000,
#     save_total_limit=2,
#     prediction_loss_only=True
# )

# # Use the Trainer class to fine-tune the model
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=dataset,
#     data_collator=data_collator
# )

# # Train the model
# trainer.train()

# # Save the fine-tuned model
# model.save_pretrained("fine-tuned-gpt2")
# tokenizer.save_pretrained("fine-tuned-gpt2")

In [15]:
# import os

# # Define the path
# path = '/kaggle/working/'

# # Walk through the directory and print files and subdirectories
# for root, dirs, files in os.walk(path):
#     print(f"Root directory: {root}")
#     for directory in dirs:
#         print(f"Subdirectory: {directory}")
#     for file in files:
#         print(f"File: {file}")

In [16]:
# def generate_recipe(model, tokenizer, cooking_time, available_ingredients, max_length=512):
#     # Prepare the prompt
#     prompt = f"Cooking Time: {cooking_time} minutes Ingredients: {available_ingredients} Steps:"
    
#     # Tokenize the prompt
#     inputs = tokenizer(prompt, return_tensors='pt')
    
#     # Check the device of the model and move inputs to the same device
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     model.to(device)
#     inputs = {key: value.to(device) for key, value in inputs.items()}
    
#     # Generate the recipe
#     outputs = model.generate(
#         inputs['input_ids'],
#         attention_mask=inputs['attention_mask'],
#         do_sample=True,
#         max_length=max_length,
#         num_return_sequences=1,
#         no_repeat_ngram_size=2,
#         top_p=0.95,
#         temperature=0.7
#     )
    
#     # Decode the output
#     recipe = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
#     return recipe

# # Example usage
# cooking_time = 30
# available_ingredients = "onions, tomatoes, chicken stock, butter"
# generated_recipe = generate_recipe(model, tokenizer, cooking_time, available_ingredients)
# print(generated_recipe)