In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/food-com-recipes-and-user-interactions/RAW_interactions.csv
/kaggle/input/food-com-recipes-and-user-interactions/ingr_map.pkl
/kaggle/input/food-com-recipes-and-user-interactions/PP_recipes.csv
/kaggle/input/food-com-recipes-and-user-interactions/RAW_recipes.csv
/kaggle/input/food-com-recipes-and-user-interactions/interactions_train.csv
/kaggle/input/food-com-recipes-and-user-interactions/interactions_test.csv
/kaggle/input/food-com-recipes-and-user-interactions/PP_users.csv
/kaggle/input/food-com-recipes-and-user-interactions/interactions_validation.csv


In [2]:
# Read data
import pickle

ingr_map = pd.read_pickle("/kaggle/input/food-com-recipes-and-user-interactions/ingr_map.pkl")
input_path = "/kaggle/input/food-com-recipes-and-user-interactions/"
df_recipes = pd.read_csv(input_path + "RAW_recipes.csv")

In [3]:
# Sort the DataFrame by 'count' column in descending order
sorted_ingr_map = ingr_map.sort_values(by='count', ascending=False)

# Get the top 500 entries
top_500_ingr_map = sorted_ingr_map.head(2500)

# Extract and print unique values from the 'replaced' column
unique_replaced = top_500_ingr_map['replaced'].unique()

print(len(unique_replaced))

565


In [4]:
import ast

# Create a set of unique replaced ingredients for quick lookup
unique_replaced_set = set(unique_replaced)

replacement_dict = pd.Series(ingr_map['replaced'].values, index=ingr_map['raw_ingr']).to_dict()

# Function to convert and replace ingredients
def convert_and_replace(ingredient_str, replacement_dict):
    try:
        # Convert string representation of list to actual list
        ingredient_list = ast.literal_eval(ingredient_str)
        # Replace ingredients
        replaced_list = [replacement_dict.get(ingredient, ingredient) for ingredient in ingredient_list]
        return replaced_list
    except (ValueError, SyntaxError):
        # Handle cases where conversion fails
        return []

# Apply replacement to the ingredients column
df_recipes['ingredients'] = df_recipes['ingredients'].apply(lambda x: convert_and_replace(x, replacement_dict))

# Remove rows where any ingredient after replacement is not in unique_replaced
df_recipes = df_recipes[df_recipes['ingredients'].apply(lambda x: all(ingredient in unique_replaced_set for ingredient in x))]

# Inspect the updated DataFrame
df_recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
6,aww marinated olives,25274,15,21730,2002-04-14,"['15-minutes-or-less', 'time-to-make', 'course...","[380.7, 53.0, 7.0, 24.0, 6.0, 24.0, 6.0]",4,['toast the fennel seeds and lightly crush the...,my italian mil was thoroughly impressed by my ...,"[fennel seed, green olife, olife, garlic, pepp...",9
9,beat this banana bread,75452,70,15892,2003-11-04,"['weeknight', 'time-to-make', 'course', 'main-...","[2669.3, 160.0, 976.0, 107.0, 62.0, 310.0, 138.0]",12,"['preheat oven to 350 degrees', 'butter two 9x...",from ann hodgman's,"[sugar, unsalted butter, banana, egg, fresh le...",9
18,chinese chop suey,8559,70,4481,2001-01-27,"['weeknight', 'time-to-make', 'course', 'main-...","[395.4, 31.0, 20.0, 29.0, 51.0, 33.0, 8.0]",8,"['brown ground meat and onion in a large pot',...",easy one-pot dinner.,"[celery, onion, ground pork, soy sauce, beef b...",7
23,deep fried dessert thingys,107699,20,158966,2005-01-05,"['30-minutes-or-less', 'time-to-make', 'course...","[1663.3, 221.0, 168.0, 66.0, 19.0, 158.0, 29.0]",20,"['in a large bowl , mix flour , granulated sug...",my mother used to make this for us as a specia...,"[all-purpose flmy, granulated sugar, baking po...",13
24,easiest ever hollandaise sauce,49262,25,64428,2002-12-19,"['30-minutes-or-less', 'time-to-make', 'course...","[1290.4, 213.0, 4.0, 53.0, 22.0, 417.0, 1.0]",7,['cut the butter into several pieces and bring...,the secret to this easy hollandaise sauce is i...,"[butter, lemon, juice of, salt, white pepper, ...",5


In [5]:
# Combine the relevant columns into a new dataframe
df_simplified = df_recipes[['minutes', 'ingredients', 'steps']].copy()

# Format the ingredients as a comma-separated string for better readability
df_simplified['ingredients'] = df_simplified['ingredients'].apply(lambda x: ', '.join(x))

# Define a separator token for individual ingredients
ingredient_separator = " <sep> "

# Format each recipe entry with start and end tokens for each section
def format_recipe_with_tokens(row):
    # Convert ingredients list into a string with <sep> token between each ingredient
    ingredients = ingredient_separator.join(row['ingredients'].split(', '))
    
    # Format the recipe with start and end tokens
    return (
        f"<start-time> {row['minutes']} minutes <end-time> "
        f"<start-ingredients> {ingredients} <end-ingredients> "
        f"<start-steps> {row['steps']} <end-steps>"
    )

# Apply the formatting function to each row
df_simplified['formatted_recipe_with_tokens'] = df_simplified.apply(format_recipe_with_tokens, axis=1)

In [6]:
df_simplified.iloc[0]['formatted_recipe_with_tokens']

"<start-time> 15 minutes <end-time> <start-ingredients> fennel seed <sep> green olife <sep> olife <sep> garlic <sep> peppercorn <sep> orange rind <sep> orange juice <sep> chile <sep> olive oil <end-ingredients> <start-steps> ['toast the fennel seeds and lightly crush them', 'place all the ingredients in a bowl , stir well', 'cover and leave to marinate', 'keep refrigerated and use within 1 to 2 days'] <end-steps>"

In [7]:
recipes = df_simplified['formatted_recipe_with_tokens'].tolist()

from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Define special tokens, including separate pad and eos tokens
special_tokens = {
    'eos_token': '<eos>',
    'unk_token': '<unk>',
    'pad_token': '<pad>',
    'additional_special_tokens': [
        '<start-time>', '<end-time>', 
        '<start-ingredients>', '<end-ingredients>', 
        '<start-steps>', '<end-steps>', '<sep>'
    ]
}

# Add the special tokens to the tokenizer vocabulary
tokenizer.add_special_tokens(special_tokens)

# Verify tokens
print("Special Tokens:", tokenizer.special_tokens_map)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Special Tokens: {'bos_token': '<|endoftext|>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<start-time>', '<end-time>', '<start-ingredients>', '<end-ingredients>', '<start-steps>', '<end-steps>', '<sep>']}




In [8]:
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50267, 768)

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader

class RecipeDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Extract input IDs and attention mask, ensure labels align with input_ids
        input_ids = inputs["input_ids"].squeeze()
        attention_mask = inputs["attention_mask"].squeeze()
        
        # Set labels to input_ids, masking out pad tokens
        labels = input_ids.clone()
        labels[labels == tokenizer.pad_token_id] = -100  # Ignore padding tokens in loss

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

# Create dataset and dataloader
max_length = 512
dataset = RecipeDataset(df_simplified['formatted_recipe_with_tokens'].tolist(), tokenizer, max_length=max_length)
train_loader = DataLoader(dataset, batch_size=2, shuffle=True)

In [10]:
# Move model to device (GPU if available)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
num_epochs = 3  # Adjust as needed
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")

print("Training completed!")

Epoch 1/3, Loss: 1.4535237550735474
Epoch 2/3, Loss: 1.7250399589538574
Epoch 3/3, Loss: 1.7913089990615845
Training completed!


In [11]:
model.save_pretrained("trained_gpt2_model")
tokenizer.save_pretrained("trained_gpt2_model")

('trained_gpt2_model/tokenizer_config.json',
 'trained_gpt2_model/special_tokens_map.json',
 'trained_gpt2_model/vocab.json',
 'trained_gpt2_model/merges.txt',
 'trained_gpt2_model/added_tokens.json')

In [12]:
model = GPT2LMHeadModel.from_pretrained("trained_gpt2_model").to(device)
tokenizer = GPT2Tokenizer.from_pretrained("trained_gpt2_model")

onions, tomatoes, chicken stock, butter

# Define your input
input_text = (
    "<start-time> 30 minutes <end-time> "
    "<start-ingredients> onions <sep> tomatoes <sep> chicken stock <sep> butter <end-ingredients> "
    "<start-steps>"
)

# Encode input and generate output
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

# Generate steps
outputs = model.generate(input_ids, max_length=512, top_k=10, top_p=0.9, do_sample=True)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Recipe Steps:", generated_text)

SyntaxError: invalid syntax (4278365561.py, line 4)

In [None]:
# recipes = df_simplified['formatted_recipe_with_tokens'].tolist()

# from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
# import torch

# # Load the pre-trained GPT-2 model and tokenizer
# model_name = "gpt2"
# tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# model = GPT2LMHeadModel.from_pretrained(model_name)

# # Set eos_token as pad_token
# tokenizer.pad_token = tokenizer.eos_token

# # Tokenize the dataset
# inputs = tokenizer(recipes, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
# dataset = torch.utils.data.TensorDataset(inputs["input_ids"], inputs["attention_mask"])

In [None]:
# from torch.utils.data import Dataset
# from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# class RecipeDataset(Dataset):
#     def __init__(self, encodings):
#         self.encodings = encodings

#     def __getitem__(self, idx):
#         item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
#         return item

#     def __len__(self):
#         return len(self.encodings['input_ids'])

# # Create the dataset
# dataset = RecipeDataset(inputs)

In [None]:
# # Define data collator
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# import wandb

# wandb.login(key='c65824cada15b84e31e7e9f35db6fe50eecfd857')

# # Define training arguments
# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=3,
#     per_device_train_batch_size=2,  # Adjust based on your GPU's capacity
#     save_steps=1000,
#     save_total_limit=2,
#     prediction_loss_only=True
# )

# # Use the Trainer class to fine-tune the model
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=dataset,
#     data_collator=data_collator
# )

# # Train the model
# trainer.train()

# # Save the fine-tuned model
# model.save_pretrained("fine-tuned-gpt2")
# tokenizer.save_pretrained("fine-tuned-gpt2")

In [None]:
# import os

# # Define the path
# path = '/kaggle/working/'

# # Walk through the directory and print files and subdirectories
# for root, dirs, files in os.walk(path):
#     print(f"Root directory: {root}")
#     for directory in dirs:
#         print(f"Subdirectory: {directory}")
#     for file in files:
#         print(f"File: {file}")

In [None]:
# def generate_recipe(model, tokenizer, cooking_time, available_ingredients, max_length=512):
#     # Prepare the prompt
#     prompt = f"Cooking Time: {cooking_time} minutes Ingredients: {available_ingredients} Steps:"
    
#     # Tokenize the prompt
#     inputs = tokenizer(prompt, return_tensors='pt')
    
#     # Check the device of the model and move inputs to the same device
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     model.to(device)
#     inputs = {key: value.to(device) for key, value in inputs.items()}
    
#     # Generate the recipe
#     outputs = model.generate(
#         inputs['input_ids'],
#         attention_mask=inputs['attention_mask'],
#         do_sample=True,
#         max_length=max_length,
#         num_return_sequences=1,
#         no_repeat_ngram_size=2,
#         top_p=0.95,
#         temperature=0.7
#     )
    
#     # Decode the output
#     recipe = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
#     return recipe

# # Example usage
# cooking_time = 30
# available_ingredients = "onions, tomatoes, chicken stock, butter"
# generated_recipe = generate_recipe(model, tokenizer, cooking_time, available_ingredients)
# print(generated_recipe)