In [None]:
# Mount Google Drive at /content/drive; the cells below read and write their
# files under /content/drive/MyDrive/FoodRecipeGenerator.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
#Cell_1
# Build a simplified recipe list from the raw RecipeNLG CSV and cache it on
# Drive as a pickle so later cells do not have to re-parse the CSV.
import ast  # to safely parse list-like strings
import pickle

import pandas as pd

# Load CSV (replace path if needed)
df = pd.read_csv("/content/drive/MyDrive/FoodRecipeGenerator/RecipeNLG/RecipeNLG_dataset.csv")

# Convert stringified lists to actual lists
df["ingredients"] = df["ingredients"].apply(ast.literal_eval)
df["directions"] = df["directions"].apply(ast.literal_eval)

# Combine into simplified format. Walking the three columns in lockstep yields
# the same records, in the same order, as df.iterrows() but without building a
# pandas Series for every row, which is very slow on a large frame.
recipes = [
    {
        "title": title,
        "ingredients": ingredients,
        "instructions": " ".join(directions),  # combine steps
    }
    for title, ingredients, directions in zip(
        df["title"], df["ingredients"], df["directions"]
    )
]

# Optional: preview a recipe
print(recipes[0])

with open("/content/drive/MyDrive/FoodRecipeGenerator/recipes_cleaned.pkl", "wb") as f:
    pickle.dump(recipes, f)

print("✅ Recipes saved as Pickle!")


{'title': 'No-Bake Nut Cookies', 'ingredients': ['1 c. firmly packed brown sugar', '1/2 c. evaporated milk', '1/2 tsp. vanilla', '1/2 c. broken nuts (pecans)', '2 Tbsp. butter or margarine', '3 1/2 c. bite size shredded rice biscuits'], 'instructions': 'In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine. Stir over medium heat until mixture bubbles all over top. Boil and stir 5 minutes more. Take off heat. Stir in vanilla and cereal; mix well. Using 2 teaspoons, drop and shape into 30 clusters on wax paper. Let stand until firm, about 30 minutes.'}
✅ Recipes saved as Pickle!


In [None]:
#Cell_3
import os
import numpy as np
import glob

# Define save directory again
save_dir = "/content/drive/MyDrive/FoodRecipeGenerator/embeddings_batches"

# Load all batches in *numeric* batch order.
# A plain sorted() on the paths is lexicographic ("..._1", "..._10", "..._100",
# ..., "..._2"), which stacks the rows in a different order than the recipes
# they were computed from. The batch number is the text between the last "_"
# and the first "." of the file name.
batch_files = sorted(
    glob.glob(f"{save_dir}/embeddings_batch_*.npy"),
    key=lambda path: int(os.path.basename(path).split("_")[-1].split(".")[0]),
)
all_embeddings = []
for file in batch_files:
    all_embeddings.append(np.load(file))

# Combine into one big array (row i now corresponds to recipe i)
embeddings = np.vstack(all_embeddings)

print("Total embeddings shape:", embeddings.shape)


Total embeddings shape: (2231142, 384)


In [None]:
#Cell_2
import os
import pickle

import numpy as np
import torch
from sentence_transformers import SentenceTransformer

# Use the GPU when one is visible, otherwise stay on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Settings
batch_size = 1000
save_dir = "/content/drive/MyDrive/FoodRecipeGenerator/embeddings_batches"

# Make sure the output folder exists before the first batch is written
os.makedirs(save_dir, exist_ok=True)

with open("/content/drive/MyDrive/FoodRecipeGenerator/recipes_cleaned.pkl", "rb") as f:
    recipes = pickle.load(f)

# Encode the recipes chunk by chunk; every chunk is written to its own file,
# numbered from 1.
for batch_number, i in enumerate(range(0, len(recipes), batch_size), start=1):
    # One text per recipe: its ingredient lines joined by spaces
    texts = [" ".join(r["ingredients"]) for r in recipes[i:i+batch_size]]

    print(f"Encoding batch {batch_number} ...")
    batch_embeddings = model.encode(
        texts,
        batch_size=64,
        convert_to_numpy=True,
        show_progress_bar=True
    )

    # Persist this chunk's embeddings
    batch_path = os.path.join(save_dir, f"embeddings_batch_{batch_number}.npy")
    np.save(batch_path, batch_embeddings)
    print(f"Saved: {batch_path}")


In [None]:
# Install the third-party packages imported by the app cell below (-q keeps pip quiet).
!pip install -q gradio faiss-cpu sentence-transformers transformers accelerate sentencepiece


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.2/54.2 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.3/323.3 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.5/11.5 MB[0m [31m134.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m81.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
#Cell_4 NO NEED TO RE RUN THIS CELL ALREADY SAVED
# NOTE: an index that was written while the batch files were loaded in
# lexicographic order ("..._1", "..._10", ..., "..._2") has rows that do not
# line up with the recipe list. Re-run this cell once to rebuild the index
# with the numeric ordering used below.
import glob
import os
import pickle

import faiss
import numpy as np

# Load combined embeddings, ordered by the integer batch number in the file
# name so that row i of the stacked array belongs to recipe i.
save_dir = "/content/drive/MyDrive/FoodRecipeGenerator/embeddings_batches"
batch_files = sorted(
    glob.glob(f"{save_dir}/embeddings_batch_*.npy"),
    key=lambda path: int(os.path.basename(path).split("_")[-1].split(".")[0]),
)
all_embeddings = []
for file in batch_files:
    all_embeddings.append(np.load(file))
embeddings = np.vstack(all_embeddings)

# Convert to float32 (FAISS requires it)
embeddings = embeddings.astype('float32')

# Build the index
index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 distance
index.add(embeddings)

# Save the index
faiss.write_index(index, "/content/drive/MyDrive/FoodRecipeGenerator/recipe_faiss.index")
print("✅ FAISS index saved!")


In [None]:
import gradio as gr
import faiss
import numpy as np
import pickle
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import re
import pandas as pd
from difflib import get_close_matches

# Cleaned recipes (the list pickled by the preprocessing cell)
with open("/content/drive/MyDrive/FoodRecipeGenerator/recipes_cleaned.pkl", "rb") as recipes_file:
    recipes = pickle.load(recipes_file)

# FAISS index built over the recipe embeddings
index = faiss.read_index("/content/drive/MyDrive/FoodRecipeGenerator/recipe_faiss.index")

# Sentence-embedding model used to embed the user's query (kept on the CPU)
embed_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')

# Phi-2 tokenizer and causal language model
phi2_checkpoint = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(phi2_checkpoint)
phi2_load_options = dict(
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
phi_model = AutoModelForCausalLM.from_pretrained(phi2_checkpoint, **phi2_load_options)

# Search similar recipes
def search_similar_recipes(query_ingredients, model, index, recipes, top_k=3):
    """Return up to ``top_k`` recipes nearest to the query ingredients.

    The query ingredients are joined into one string, embedded with ``model``
    and looked up in ``index``. ``top_k * 3`` neighbours are requested so that
    recipes which actually mention a query ingredient can be preferred.

    Args:
        query_ingredients: list of ingredient strings (e.g. ``["chicken"]``).
        model: object exposing ``encode(text, convert_to_numpy=True)``.
        index: object exposing ``search(vectors, k)`` that returns
            ``(distances, ids)``; ``ids[0]`` holds positions into ``recipes``.
        recipes: sequence of recipe dicts with an ``"ingredients"`` list of
            strings.
        top_k: maximum number of recipes to return.

    Returns:
        A list of at most ``top_k`` recipe dicts: first the neighbours that
        mention a query ingredient (in index rank order), then the remaining
        neighbours (also in rank order).
    """
    query_text = " ".join(query_ingredients)
    query_embedding = model.encode(query_text, convert_to_numpy=True).astype('float32')
    _, neighbour_ids = index.search(np.array([query_embedding]), k=top_k * 3)

    # Keep each valid id once, in rank order. FAISS pads the result with -1
    # when fewer than k neighbours exist, and a negative id would silently
    # index `recipes` from the end, so out-of-range ids are dropped.
    candidates = []
    seen = set()
    for raw_idx in neighbour_ids[0]:
        idx = int(raw_idx)
        if 0 <= idx < len(recipes) and idx not in seen:
            seen.add(idx)
            candidates.append(idx)

    terms = [q.strip().lower() for q in query_ingredients if q.strip()]

    def mentions_query(idx):
        # Substring test: recipe lines look like "1 c. firmly packed brown
        # sugar", so comparing a query term for equality with a whole line
        # (list membership) would practically never succeed.
        lines = [ing.lower() for ing in recipes[idx]["ingredients"]]
        return any(term in line for term in terms for line in lines)

    preferred = [idx for idx in candidates if mentions_query(idx)]
    preferred_ids = set(preferred)
    remaining = [idx for idx in candidates if idx not in preferred_ids]

    return [recipes[idx] for idx in (preferred + remaining)[:top_k]]

# Prompt builder
def build_generation_prompt(input_ingredients, retrieved_recipes, num_examples=3):
    """Assemble the few-shot prompt handed to the language model.

    The prompt lists up to ``num_examples`` of ``retrieved_recipes`` (title,
    comma-joined ingredients, instructions), then asks for a new recipe that
    uses ``input_ingredients`` and ends with an open "Recipe Name:" line for
    the model to complete.

    Returns:
        The prompt as a single string.
    """
    sections = ["You are a creative chef. Below are some example recipes:\n\n"]

    for number, example in enumerate(retrieved_recipes[:num_examples], start=1):
        sections.append(
            f"🍽 Recipe {number}: {example['title']}\n"
            f"Ingredients:\n{', '.join(example['ingredients'])}\n"
            f"Instructions:\n{example['instructions']}\n\n"
        )

    sections.extend([
        "Now create a new, unique recipe using these ingredients:\n",
        f"{', '.join(input_ingredients)}\n",
        "Please format the instructions as clearly numbered steps (e.g., 1. ..., 2. ..., etc).\n",
        "\n🍽 Recipe Name:",
    ])
    return "".join(sections)

# ✅ FIXED: Extract final generated recipe correctly
def extract_new_recipe_details(text):
    """Parse the last "🍽 Recipe Name: / Ingredients: / Instructions:" block in ``text``.

    Args:
        text: decoded model output (may contain several recipe blocks).

    Returns:
        A ``(recipe_name, ingredients, instructions)`` tuple of stripped
        strings. ``instructions`` is the run of numbered steps ("1. ...")
        joined by newlines, or the raw instructions text when no numbered
        step is present. When no block is found the result is
        ``("Unnamed Recipe", "", "")``.
    """
    recipe_pattern = re.compile(
        r"🍽 Recipe Name:\s*(.+?)\n+Ingredients:\s*(.+?)\n+Instructions:\s*(.+?)(?=(\n+🍽 Recipe Name:|\Z))",
        re.DOTALL | re.IGNORECASE
    )

    matches = recipe_pattern.findall(text)
    if not matches:
        return "Unnamed Recipe", "", ""

    # Use the last match; the fourth group is only the look-ahead terminator.
    recipe_name, ingredients, instructions_raw, _ = matches[-1]

    steps = []
    for line in instructions_raw.strip().splitlines():
        if not line.strip():
            # A blank separator between steps must not end the step list,
            # otherwise "1. ...\n\n2. ..." is truncated to the first step.
            continue
        # Ignore leading indentation when looking for the "N. " marker.
        if re.match(r"^\d+\.\s", line.lstrip()):
            steps.append(line.strip())
        elif steps:
            # First non-numbered text after the steps: treat it as trailing
            # commentary and stop.
            break

    instructions = "\n".join(steps) if steps else instructions_raw.strip()

    return recipe_name.strip(), ingredients.strip(), instructions.strip()

# Generate recipe
def generate_recipe(prompt, max_new_tokens=500):
    """Sample a completion for ``prompt`` and parse it into recipe fields.

    Args:
        prompt: full text prompt for the language model.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        The ``(recipe_name, ingredients, instructions)`` tuple produced by
        ``extract_new_recipe_details`` from the decoded output.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(phi_model.device)
    with torch.no_grad():
        outputs = phi_model.generate(
            input_ids=encoded["input_ids"],
            # Pass the mask explicitly: pad_token_id is set to the EOS id
            # below, so generate() cannot reliably infer the mask from the
            # ids and warns when it is missing.
            attention_mask=encoded["attention_mask"],
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.8,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id
        )
    # outputs[0] is decoded as a whole and handed to the parser, which picks
    # the last recipe block it finds.
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return extract_new_recipe_details(output_text)

# Load nutrition data, with the food names normalised to lower case
nutrition_df = pd.read_csv(
    "/content/drive/MyDrive/FoodRecipeGenerator/daily_food_nutrition_dataset.csv"
).assign(Food_Item=lambda frame: frame['Food_Item'].str.lower())

# Nutrition analysis
def get_nutritional_values(ingredients_text):
    """Build a plain-text nutrition report for a comma-separated ingredient list.

    Every ingredient is stripped of quantities, units and preparation words
    and then fuzzy-matched (``difflib.get_close_matches``, cutoff 0.6) against
    the ``Food_Item`` column of ``nutrition_df``. A matched ingredient uses the
    values of the first row with that food name; an unmatched one falls back
    to random placeholder numbers labelled "Estimated".

    Args:
        ingredients_text: ingredients separated by commas.

    Returns:
        A multi-line string: one bullet per ingredient followed by the totals
        for calories, protein, fat and carbohydrates.
    """
    import random  # only needed for the placeholder fallback below

    def clean_ingredient(raw_ing):
        # Drop "<number> <unit>" prefixes. The word boundary sits directly
        # after the unit and the abbreviation dot is consumed afterwards, so
        # "1 c. flour" and "2 Tbsp. butter" are both handled; with the dot
        # required before the boundary, "c." could never match.
        cleaned = re.sub(r"\b\d+([\/.]\d+)?\s*(cup[s]?|c|tbsp[s]?|tsp[s]?|oz|ounce[s]?|lb[s]?|pound[s]?|grams?|g|ml|liter[s]?)\b\.?", "", raw_ing, flags=re.IGNORECASE)
        cleaned = re.sub(r"\b(chopped|shredded|diced|minced|sliced|halved|strips|fresh|large|small|medium)\b", "", cleaned, flags=re.IGNORECASE)
        cleaned = re.sub(r"[^a-zA-Z\s]", "", cleaned).strip().lower()
        return cleaned

    ingredients = [i.strip() for i in ingredients_text.split(",") if i.strip()]
    cleaned_ingredients = [clean_ingredient(i) for i in ingredients]

    # Loop-invariant lookups, computed once instead of once per ingredient.
    # Missing names are dropped (difflib cannot compare against non-strings)
    # and duplicates are removed while keeping first-seen order.
    lowered_names = nutrition_df['Food_Item'].str.lower()
    candidate_names = list(dict.fromkeys(lowered_names.dropna()))

    total_nutrition = {
        'Calories': 0,
        'Protein': 0,
        'Total Fat': 0,
        'Carbohydrates': 0
    }

    nutrition_summary = "🔍 Ingredient-wise Nutrition:\n"

    for original, cleaned in zip(ingredients, cleaned_ingredients):
        match = get_close_matches(cleaned, candidate_names, n=1, cutoff=0.6)
        if match:
            row = nutrition_df[lowered_names == match[0]].iloc[0]
            cal = row.get('Calories (kcal)', 0)
            pro = row.get('Protein (g)', 0)
            fat = row.get('Fat (g)', 0)
            carb = row.get('Carbohydrates (g)', 0)
            source = match[0]
        else:
            # NOTE(review): these are random placeholders, not real estimates;
            # they also make the totals differ from run to run.
            cal = random.randint(50, 200)
            pro = round(random.uniform(1, 5), 1)
            fat = round(random.uniform(1, 5), 1)
            carb = round(random.uniform(5, 15), 1)
            source = "Estimated"

        nutrition_summary += f"• {original} ({source}): {cal} kcal, {pro} g protein, {fat} g fat, {carb} g carbs\n"

        total_nutrition['Calories'] += cal
        total_nutrition['Protein'] += pro
        total_nutrition['Total Fat'] += fat
        total_nutrition['Carbohydrates'] += carb

    nutrition_summary += "\n📊 Estimated Total Nutrition:\n"
    nutrition_summary += f"Calories: {total_nutrition['Calories']:.0f} kcal\n"
    nutrition_summary += f"Protein: {total_nutrition['Protein']:.1f} g\n"
    nutrition_summary += f"Fat: {total_nutrition['Total Fat']:.1f} g\n"
    nutrition_summary += f"Carbs: {total_nutrition['Carbohydrates']:.1f} g"

    return nutrition_summary

# Main function
def generate_and_display(user_input):
    """Run retrieval, generation and nutrition analysis for one user request.

    Args:
        user_input: comma-separated ingredient names.

    Returns:
        A 5-tuple ``(recipe_name, ingredients, instructions, nutrition_info,
        retrieved_text)``. When no ingredient is given, the first element is
        a prompt to enter one and the rest are empty strings.
    """
    ingredients = []
    for piece in user_input.split(","):
        piece = piece.strip()
        if piece:
            ingredients.append(piece)

    if len(ingredients) == 0:
        return "Please enter at least one ingredient.", "", "", "", ""

    # Retrieve examples, prompt the model with them, then analyse the result
    similar_recipes = search_similar_recipes(ingredients, embed_model, index, recipes)
    recipe_name, gen_ingredients, gen_instructions = generate_recipe(
        build_generation_prompt(ingredients, similar_recipes)
    )
    nutrition_info = get_nutritional_values(gen_ingredients)

    # Human-readable dump of the retrieved examples, one block per recipe
    retrieved_blocks = []
    for r in similar_recipes:
        retrieved_blocks.append(
            f"🍽 {r['title']}\nIngredients:\n{', '.join(r['ingredients'])}\nInstructions:\n{r['instructions']}"
        )
    retrieved_text = "\n\n".join(retrieved_blocks)

    return recipe_name, gen_ingredients, gen_instructions, nutrition_info, retrieved_text

# Gradio UI
def launch_gradio() -> None:
    """Build the Gradio Blocks interface for the recipe generator and launch it.

    The layout is an ingredient text box and a "Generate Recipe" button above
    a row with two columns: the generated recipe fields on the left, and a
    hidden "retrieved recipes" box with a button that reveals it on the right.
    """
    with gr.Blocks() as demo:
        gr.Markdown("## 🍳 RecipeGen: AI Recipe Generator")
        gr.Markdown("Enter ingredients to get a unique recipe generated by AI.")

        input_box = gr.Textbox(label="Enter ingredients (comma-separated)", lines=1, placeholder="e.g., chicken, potato, onion")
        generate_button = gr.Button("Generate Recipe")

        with gr.Row():
            # Left column: the four fields of the generated recipe
            with gr.Column():
                name_output = gr.Textbox(label="🍽 Recipe Name", lines=1)
                ing_output = gr.Textbox(label="🧂 Ingredients", lines=8)
                inst_output = gr.Textbox(label="📋 Instructions", lines=10)
                nutrition_output = gr.Textbox(label="🍽 Nutritional Info", lines=4)

            # Right column: retrieved examples, created hidden (visible=False)
            with gr.Column():
                retrieved_box = gr.Textbox(label="📖 Retrieved Recipes (Click to Show)", visible=False, lines=20)
                show_button = gr.Button("Show Retrieved Recipes")

        # The outputs list follows the order of the 5-tuple returned by
        # generate_and_display.
        generate_button.click(fn=generate_and_display,
                              inputs=input_box,
                              outputs=[name_output, ing_output, inst_output, nutrition_output, retrieved_box])
        # The handler ignores its input and only switches the box to visible.
        show_button.click(fn=lambda x: gr.update(visible=True), inputs=retrieved_box, outputs=retrieved_box)

    demo.launch()

# Launch the app
launch_gradio()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://991a04111de0972098.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


# Check that all embedding batches were saved


In [None]:
#Cell_7
import os
import numpy as np
import glob
import math
import pickle

# Load recipes
with open("/content/drive/MyDrive/FoodRecipeGenerator/recipes_cleaned.pkl", "rb") as f:
    recipes = pickle.load(f)

# How many batch files there should be for this many recipes
batch_size = 1000
total_recipes = len(recipes)
expected_batches = math.ceil(total_recipes / batch_size)

# Batch files actually present on Drive
save_dir = "/content/drive/MyDrive/FoodRecipeGenerator/embeddings_batches"
saved_files = sorted(glob.glob(os.path.join(save_dir, "embeddings_batch_*.npy")))

# The batch number is the text between the last "_" and the first "." of the
# file name
saved_batch_numbers = []
for path in saved_files:
    number_text = os.path.basename(path).split("_")[-1].split(".")[0]
    saved_batch_numbers.append(int(number_text))
saved_batch_numbers.sort()

# Expected batch numbers (1..expected_batches) that have no file, ascending
missing_batches = sorted(set(range(1, expected_batches + 1)) - set(saved_batch_numbers))

print(f"✅ Total recipes: {total_recipes}")
print(f"📦 Expected batches: {expected_batches}")
print(f"📂 Found batches: {len(saved_files)}")
print(f"❌ Missing batch numbers: {missing_batches or 'None'}")

