In [None]:
import os
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import gdown
import zipfile
import os
import json
import re

In [None]:
# Google Drive location of the zipped Recipe Box dataset.
url = 'https://drive.google.com/uc?id=1yIDmIeifB2_rwl6U45F23AAgnpSmnkbc'

# Download the archive only when a valid copy is not already on disk, so that
# re-running this cell (or Restart & Run All) does not fetch it again.
output = 'recipe_box.zip'
if not zipfile.is_zipfile(output):
    gdown.download(url, output, quiet=False)

# Fail early with a clear message instead of a confusing error while unzipping.
if not zipfile.is_zipfile(output):
    raise RuntimeError('Download failed: ' + output + ' is missing or is not a valid zip archive')

# Unzip the archive and store the dataset in recipe_box/.
extract_to = 'recipe_box'
os.makedirs(extract_to, exist_ok=True)

with zipfile.ZipFile(output, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

In [None]:
def clean_text(text):
    """Strip scraper noise from a recipe string and normalise its whitespace.

    Every occurrence of the literal marker "ADVERTISEMENT" is treated as a
    separator, then each run of whitespace is collapsed to a single space and
    leading/trailing whitespace is removed.

    Args:
        text: Raw string (an ingredient line or the instructions text).

    Returns:
        The cleaned string; it is empty when the input held nothing but the
        marker and/or whitespace.
    """
    # Two passes on purpose: a single alternation such as r'ADVERTISEMENT|\s+'
    # replaces the marker and the whitespace around it as separate matches,
    # which leaves several consecutive spaces behind.
    without_marker = text.replace('ADVERTISEMENT', ' ')
    return re.sub(r'\s+', ' ', without_marker).strip()

def clean_dataset(dataset):
    """Clean the text fields of every recipe in ``dataset``.

    Each ingredient string and the instructions string are passed through
    ``clean_text``. Ingredients that are empty before or after cleaning are
    dropped.

    Args:
        dataset: List of recipe dicts. The dicts are modified in place.

    Returns:
        The same ``dataset`` list that was passed in.
    """
    for recipe in dataset:
        # .get() also skips a present-but-None/empty value instead of crashing.
        if recipe.get('ingredients'):
            cleaned_ingredients = []
            for ingredient in recipe['ingredients']:
                if not ingredient:
                    continue
                cleaned = clean_text(ingredient)
                # Filter after cleaning: an entry that was only noise
                # (e.g. a lone "ADVERTISEMENT") cleans down to ''.
                if cleaned:
                    cleaned_ingredients.append(cleaned)
            recipe['ingredients'] = cleaned_ingredients
        if recipe.get('instructions'):
            recipe['instructions'] = clean_text(recipe['instructions'])
    return dataset

def recipe_validate_required_fields(recipe):
    """Check that a recipe has usable title, ingredients and instructions.

    Args:
        recipe: Recipe dict, or a falsy value such as ``None`` or ``{}``.

    Returns:
        ``True`` when every required key is present with a non-empty value;
        ``False`` otherwise. Strings made only of whitespace count as empty.
    """
    required_keys = ['title', 'ingredients', 'instructions']
    if not recipe:
        return False
    for required_key in required_keys:
        value = recipe.get(required_key)
        # One falsiness test covers a missing key, None, '' and [] alike, so
        # no separate empty-list check is needed.
        if not value:
            return False
        # A blank string carries no usable text either.
        if isinstance(value, str) and not value.strip():
            return False
    return True

In [None]:
def load_dataset(dataset_dir='recipe_box', dataset_file_names=None):
  """Load, validate and clean the recipe JSON files.

  Each file is expected to hold a JSON object whose values are recipe dicts.
  Recipes failing ``recipe_validate_required_fields`` are discarded, the rest
  are cleaned with ``clean_dataset``, and a short summary of every file is
  printed with ``print_dataset_info``.

  Args:
    dataset_dir: Directory containing the extracted JSON files.
    dataset_file_names: File names to load from ``dataset_dir``. ``None``
      selects the default list below.

  Returns:
    A single list with the cleaned recipes of all loaded files.
  """
  if dataset_file_names is None:
    dataset_file_names = [
      'recipes_raw_nosource_ar.json',
      # 'recipes_raw_nosource_epi.json',
      # 'recipes_raw_nosource_fn.json',
    ]

  dataset = []
  for dataset_file_name in dataset_file_names:
    dataset_file_path = os.path.join(dataset_dir, dataset_file_name)
    # Explicit encoding: without it open() falls back to the platform default,
    # which is not UTF-8 everywhere.
    with open(dataset_file_path, 'r', encoding='utf-8') as dataset_file:
      json_data_dict = json.load(dataset_file)

    valid_recipes = [
      recipe for recipe in json_data_dict.values()
      if recipe_validate_required_fields(recipe)
    ]
    cleaned_data = clean_dataset(valid_recipes)
    dataset.extend(cleaned_data)
    print_dataset_info(dataset_file_path, cleaned_data)

  return dataset

def print_dataset_info(file_path, json_data):
  """Print a short summary of one loaded dataset file.

  Shows the file path and the number of recipes; when there is at least one
  recipe, the title, ingredients and instructions of the first one follow,
  terminated by a blank line.

  Args:
    file_path: Path shown as the heading of the summary.
    json_data: List of recipe dicts loaded from that file.
  """
  print(file_path)
  print('samples size:', len(json_data))
  if not json_data:
    return

  first_recipe = json_data[0]
  for label, key in (('title:', 'title'),
                     ('ingredients:', 'ingredients'),
                     ('instructions:', 'instructions')):
    print(label, first_recipe[key])
  print()

# Load, validate and clean the recipes once; the tokenization cell below reads
# its texts from dataset_raw.
dataset_raw = load_dataset()

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Parameters
vocab_size = 5000           # Passed to the tokenizer as num_words
max_sequence_length = 500   # Maximum length of sequences

# One text per recipe: its ingredient lines followed by its instructions, so
# the vocabulary is fitted on both the input side and the target side.
all_texts = [
    ' '.join(recipe['ingredients'] + [recipe['instructions']])
    for recipe in dataset_raw
]

# Tokenizer configuration
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(all_texts)

# Function to prepare sequences
def prepare_sequences(texts, tokenizer, max_sequence_length):
    """Turn texts into fixed-length integer sequences.

    Args:
        texts: List of strings to encode.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        max_sequence_length: Length every returned sequence is padded or
            truncated to.

    Returns:
        The result of ``pad_sequences``: shorter sequences are padded at the
        end and longer ones are cut at the end.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=max_sequence_length,
        padding='post',
        truncating='post',
    )

# Preparing data: collect the input text (joined ingredients) and the target
# text (instructions) of every recipe in one pass, keeping both lists aligned.
ingredients_seqs = []
instructions_seqs = []
for recipe in dataset_raw:
    ingredients_seqs.append(' '.join(recipe['ingredients']))
    instructions_seqs.append(recipe['instructions'])

# Encode both sides with the same tokenizer and the same fixed length
X = prepare_sequences(ingredients_seqs, tokenizer, max_sequence_length)
y = prepare_sequences(instructions_seqs, tokenizer, max_sequence_length)

# Splitting data into training and validation sets (80/20, fixed seed)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("X_train shape:", X_train.shape)  # Should be (num_samples, sequence_length)
print("y_train shape:", y_train.shape)  # Should also be (num_samples, sequence_length)


X_train shape: (31617, 500)
y_train shape: (31617, 500)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Model configuration
embedding_dim = 256

# Built layer by layer: token ids -> embeddings -> LSTM that returns an output
# for every timestep -> softmax over the vocabulary at every timestep.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_sequence_length))
model.add(LSTM(256, return_sequences=True))
# Optional second recurrent layer, currently disabled:
# model.add(LSTM(32, return_sequences=True))
model.add(Dense(vocab_size, activation='softmax'))

# The sparse loss takes integer token ids as targets directly.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Training configuration
epochs = 10
batch_size = 64

# Train the model; the held-out split is evaluated after every epoch and the
# returned History object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

In [None]:
# Free-text ingredient list used to try the trained model.
sample_ingredients = "chicken egg butter"

# Encode the sample the same way as the training inputs (post-padding and
# post-truncation to max_sequence_length).
sample_seq = tokenizer.texts_to_sequences([sample_ingredients])
sample_padded = pad_sequences(sample_seq, maxlen=max_sequence_length,
                              padding='post', truncating='post')

prediction = model.predict(sample_padded)

# Most likely token id at every timestep of the single sample.
predicted_instruction_idx = prediction.argmax(axis=-1)[0]
# Id 0 is the padding value and maps to no word; because the tokenizer has an
# oov_token, sequences_to_texts would print every predicted 0 as '<OOV>'.
# Drop those positions before decoding.
predicted_instruction_idx = predicted_instruction_idx[predicted_instruction_idx != 0]
predicted_instruction = tokenizer.sequences_to_texts([predicted_instruction_idx])

print("Generated Cooking Instructions:")
print(predicted_instruction)