In [None]:
# Install required packages
!pip install -U google-genai
!pip install faiss-cpu
!pip install google-api-core
!pip install python-dotenv



In [None]:
import pandas as pd
from google import genai
from dotenv import load_dotenv, find_dotenv
import os
import warnings
from tqdm import tqdm
import random
import time
from google.genai import types
import pickle
# Load environment variables from the .env file located by find_dotenv()
# (presumably where the API key read later in the notebook is stored).
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

True

In [None]:
# Load the recipe dataset and build one text document per recipe in the form
# "<name> → <steps>"; both columns are cast to str before concatenation.
recipes_df = pd.read_csv("../../Dataset/RAW_recipes.csv")
recipe_names = recipes_df["name"].astype(str)
recipe_steps = recipes_df["steps"].astype(str)
documents = (recipe_names + " → " + recipe_steps).tolist()

In [19]:
# Read the API key from the KEY environment variable (None when it is unset)
# and create the client used for all embedding requests.
api_key = os.environ.get("KEY")
genai_client = genai.Client(api_key=api_key)

In [20]:
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used
# below; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings(action="ignore", category=Warning)

In [21]:
# Generating embeddings with retry logic
def embedding(text, max_retries=5, model="models/text-embedding-004",
              task_type="RETRIEVAL_DOCUMENT", dim=768):
    """Embed ``text`` through ``genai_client``, retrying on any exception.

    Args:
        text: String to embed (its first 40 characters are echoed in log lines).
        max_retries: Maximum number of API attempts before giving up.
        model: Embedding model name passed to ``embed_content``.
        task_type: Task type placed in ``EmbedContentConfig``. The default suits
            documents stored in the index; pass e.g. ``"RETRIEVAL_QUERY"`` when
            embedding a search query.
        dim: Length of the fallback zero vector. It should match the output
            size of ``model`` and the dimension of the index the vectors go into.

    Returns:
        The ``values`` of the first embedding in the response, or ``[0.0] * dim``
        when every attempt failed. A zero vector carries no information, so
        callers that persist results may want to filter such entries out.
    """
    for attempt in range(max_retries):
        try:
            response = genai_client.models.embed_content(
                model=model,
                contents=text,
                config=types.EmbedContentConfig(task_type=task_type),
            )
            return response.embeddings[0].values
        except Exception as err:
            print(f"Retry {attempt + 1} for: {text[:40]}...\nError: {err}")
            # Exponential backoff with jitter, but only when another attempt
            # follows: sleeping after the final failure just delays the fallback.
            if attempt < max_retries - 1:
                time.sleep((2 ** attempt) + random.random())
    # Make the fallback visible instead of silently returning a zero vector.
    print(f"[WARN] Embedding failed after {max_retries} attempts; using a zero vector for: {text[:40]}...")
    return [0.0] * dim

In [None]:
# Draw a fixed-size random sample of the documents; it is only consumed when
# the embedding cache file below does not exist yet.
SAMPLE_FRACTION = 0.010  # share of `documents` to sample
SAMPLE_SEED = 42         # fixed seed so a fresh kernel draws the same sample
sample_size = int(len(documents) * SAMPLE_FRACTION)
# A dedicated Random instance keeps the sample reproducible without reseeding
# the global `random` module (which is also used for retry jitter).
sample_docs = random.Random(SAMPLE_SEED).sample(documents, sample_size)
embedding_file = "../../Dataset/saved_embeddings.pkl"

In [23]:
# Load or generate embeddings
if os.path.exists(embedding_file):
    # NOTE: pickle.load can execute arbitrary code while deserialising; only
    # load a cache file that this notebook (or another trusted source) wrote.
    with open(embedding_file, "rb") as f:
        formatted_knowledge = pickle.load(f)
    print(f"[INFO] Loaded {len(formatted_knowledge)} items from disk.")
else:
    formatted_knowledge = []
    for item in tqdm(sample_docs):
        # Split on the FIRST arrow only: a recipe name or step text that itself
        # contains "→" would otherwise yield more than two parts and make the
        # tuple unpacking raise ValueError.
        if "→" in item:
            title, content = item.split("→", 1)
        else:
            title, content = item, "No content"
        title = title.strip()
        formatted_knowledge.append({
            "title": title,
            "body": content.strip(),
            # Only the title is embedded; the body is stored alongside it.
            "embedding": embedding(title)
        })

    # Persist the results so later runs skip the embedding API calls.
    with open(embedding_file, "wb") as f:
        pickle.dump(formatted_knowledge, f)
    print(f"[INFO] Saved {len(formatted_knowledge)} embeddings to '{embedding_file}'")

[INFO] Loaded 2316 items from disk.


In [24]:
#Setup FAISS index
import numpy as np
import faiss

embedding_dim = 768
faiss_index = faiss.IndexFlatL2(embedding_dim)

# Collect every stored embedding and verify its length up front, so a
# mismatch is reported with the offending positions rather than surfacing
# as an error from inside faiss.
embedding_vectors = [item["embedding"] for item in formatted_knowledge]
mismatched = [pos for pos, vec in enumerate(embedding_vectors) if len(vec) != embedding_dim]
if mismatched:
    raise ValueError(
        f"{len(mismatched)} embedding(s) do not have {embedding_dim} dimensions "
        f"(first offending positions: {mismatched[:5]})"
    )

# Add all vectors in a single call as an (n, embedding_dim) float32 matrix
# instead of one add() per vector; insertion order (and therefore the ids
# used by id_map) is unchanged. Skip the call when there is nothing to add.
if embedding_vectors:
    faiss_index.add(np.asarray(embedding_vectors, dtype=np.float32))

# Map each FAISS row id back to the text it was built from.
id_map = {
    row_id: {"title": item["title"], "body": item["body"]}
    for row_id, item in enumerate(formatted_knowledge)
}

print(f"[INFO] Total documents stored in FAISS: {faiss_index.ntotal}")

[INFO] Total documents stored in FAISS: 2316
