In [1]:
!pip install tensorflow




In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import os
import tensorflow as tf
from transformers import BertTokenizer, TFBertForMaskedLM

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Install NLTK resources
nltk.download("punkt")
nltk.download("wordnet")

# Set the path to your dataset in Google Drive
dataset_path = '/content/drive/MyDrive/archive (4)/recipe_dataset.csv'

# Read the CSV file with the specified columns
recipes = pd.read_csv(dataset_path, usecols=['name', 'description', 'cuisine', 'course', 'diet', 'ingredients_name', 'ingredients_quantity', 'prep_time (in mins)', 'cook_time (in mins)', 'instructions', 'image_url'])

# Preprocessing ingredients and instructions
recipes['ingredients_name'] = recipes['ingredients_name'].str.lower().str.replace("[^a-z\s]+", " ").str.replace("(\s+|$\s+|\s+^)", " ")

# Lemmatize ingredients
lemmatizer = WordNetLemmatizer()
recipes['ingredients_name'] = recipes['ingredients_name'].fillna('').apply(lambda row: ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(row)]))

# Create the DataFrame with special tokens
df = "<RECIPE_START> <INPUT_START> <INGREDIENTS_START> " + recipes['ingredients_name'] + " <INPUT_END> <INSTRUCTIONS_START> " + \
    recipes['instructions'] + " <INSTRUCTIONS_END> <TITLE_START> " + recipes['name'] + " <TITLE_END>"

# Split the dataset into train and test
train, test = train_test_split(df, test_size=0.05)

# Set the path to save the generated data
output_dir = '/content/drive/My Drive/generated_data/'

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Save ingredients and recipes as text files
np.savetxt(os.path.join(output_dir, 'ingredients.txt'), recipes['ingredients_name'], fmt='%s')
np.savetxt(os.path.join(output_dir, 'recipes.txt'), recipes['instructions'], fmt='%s')


Mounted at /content/drive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [9]:
#Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_series = train.squeeze()
test_series = test.squeeze()
# Tokenize and encode the input sequences
input_sequences = [tokenizer.encode(seq, add_special_tokens=True, max_length=512, truncation=True, padding='max_length') for seq in train_series.str.split('<INPUT_END>').str[0]]
target_sequences = [tokenizer.encode(seq, add_special_tokens=True, max_length=512, truncation=True, padding='max_length') for seq in train_series.str.split('<INSTRUCTIONS_START>').str[1].str.split('<INSTRUCTIONS_END>').str[0]]

In [10]:
# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((input_sequences, target_sequences)).batch(16)

# Load the BERT model
model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')

# Define the training loop
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [11]:
@tf.function
def train_step(input_ids, target_ids):
    with tf.GradientTape() as tape:
        outputs = model(input_ids, labels=target_ids)
        loss = outputs.loss
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss



In [12]:
# Train the model
num_epochs = 3
for epoch in range(num_epochs):
    total_loss = 0
    for input_ids, target_ids in train_dataset:
        loss = train_step(input_ids, target_ids)
        total_loss += loss
    print(f"Epoch {epoch+1}, Loss: {total_loss}")

Epoch 1, Loss: [1683.3115]
Epoch 2, Loss: [1501.5773]
Epoch 3, Loss: [1474.6028]


In [13]:
# Save the trained model
model.save_pretrained('/content/drive/My Drive/generated_data/bert_model')

In [17]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertLMHeadModel

# Load the saved BERT model
model = TFBertForMaskedLM.from_pretrained('/content/drive/My Drive/generated_data/bert_model')

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to preprocess the input ingredients
def preprocess_input(ingredients):
    input_text = "<RECIPE_START> <INPUT_START> <INGREDIENTS_START> " + ingredients + " <INPUT_END>"
    input_ids = tokenizer.encode(input_text, add_special_tokens=True, max_length=512, truncation=True, padding='max_length')
    return input_ids

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at /content/drive/My Drive/generated_data/bert_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [18]:
# Function to generate instructions
def generate_instructions(ingredients):
    input_ids = preprocess_input(ingredients)
    input_tensor = tf.constant([input_ids])

    output = model.generate(input_tensor, max_length=512, do_sample=True, top_k=50, top_p=0.95, num_return_sequences=1)

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    instructions = generated_text.split('<INSTRUCTIONS_START>')[1].split('<INSTRUCTIONS_END>')[0]

    return instructions

In [26]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertLMHeadModel

# Load the saved BERT model
model = TFBertLMHeadModel.from_pretrained('/content/drive/My Drive/generated_data/bert_model')

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',padding_side='left')

# Function to preprocess the input ingredients
def preprocess_input(ingredients):
    input_text = "<RECIPE_START> <INPUT_START> <INGREDIENTS_START> " + ingredients + " <INPUT_END>"
    input_ids = tokenizer.encode(input_text, add_special_tokens=True, max_length=512, truncation=True, padding='max_length')
    return input_ids

# Function to generate instructions
def generate_instructions(ingredients):
    input_ids = preprocess_input(ingredients)
    input_tensor = tf.constant([input_ids])

    output = model.generate(input_tensor, max_length=1024, do_sample=True, top_k=50, top_p=0.95, num_return_sequences=1)

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    try:
        instructions = generated_text.split('<INSTRUCTIONS_START>')[1].split('<INSTRUCTIONS_END>')[0]
    except IndexError:
        # If the expected tokens are not found, return the entire generated text
        instructions = generated_text

    return instructions


If you want to use `TFBertLMHeadModel` as a standalone, add `is_decoder=True.`
All model checkpoint layers were used when initializing TFBertLMHeadModel.

All the layers of TFBertLMHeadModel were initialized from the model checkpoint at /content/drive/My Drive/generated_data/bert_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertLMHeadModel for predictions without further training.


In [28]:
ingredients = "Green Moong Dal (Whole), Pink Masoor Dal (Split), Arhar dal (Split Toor Dal), White Urad Dal (Split), Chana dal (Bengal Gram Dal), Turmeric powder (Haldi), Oil, Mustard seeds (Rai/ Kadugu), Dry Red Chillies, Bay leaf (tej patta), Lemon juice, Coriander (Dhania) Leaves, Salt, Dry coconut (kopra), Onions, Coriander Powder (Dhania), Cumin powder (Jeera), Kashmiri dry red chillies, Cloves (Laung), Cardamom (Elaichi) Pods/Seeds, Cinnamon Stick (Dalchini), Whole Black Peppercorns, Garlic"
instructions = generate_instructions(ingredients)
print(instructions)


< recipe _ start > < input _ start > < ingredients _ start > green moong dal ( whole ), pink masoor dal ( split ), arhar dal ( split toor dal ), white urad dal ( split ), chana dal ( bengal gram dal ), turmeric powder ( haldi ), oil, mustard seeds ( rai / kadugu ), dry red chillies, bay leaf ( tej patta ), lemon juice, coriander ( dhania ) leaves, salt, dry coconut ( kopra ), onions, coriander powder ( dhania ), cumin powder ( jeera ), kashmiri dry red chillies, cloves ( laung ), cardamom ( elaichi ) pods / seeds, cinnamon stick ( dalchini ), whole black peppercorns, garlic < input _ end > dal a in spices garliceric a minutes addi. you keep keep heat heateric for of the about the into the and seeds chopped for heat. add to sa. well about add add add.. the mix garlic in in of garlic chopped. water the the allow and and and the heat heat heat heat heateric all pressure add to allow the the sa garlic., allowute sa sa garlic and sa sa a powder, onions add add cook sa a done flame the a a a