In [22]:
!pip install -U google-genai==1.7.0
!pip install google-api-core
!pip install python-dotenv



In [23]:
import pandas as pd
from google import genai
from dotenv import load_dotenv
import os
import warnings
from tqdm import tqdm
import random
import time
from google.genai import types
import pickle
# Load variables from a local .env file (if one is found) into the process
# environment so that later os.getenv lookups can see them.
load_dotenv()

True

In [24]:
recipes_df = pd.read_csv("dataset.csv")

# One "name → steps" string per recipe row, collected into a plain Python list.
documents = (
    recipes_df['name'].astype(str)
    .add(" → ")
    .add(recipes_df['steps'].astype(str))
    .tolist()
)

In [None]:
# The API key is read from the KEY environment variable; the lookup yields None
# when the variable is not set.
api_key = os.environ.get('KEY')
genai_client = genai.Client(api_key=api_key)

In [None]:
# Silence every Python warning from this point on in the session.
warnings.filterwarnings("ignore")
# Register tqdm's pandas integration (adds progress_apply and related methods).
tqdm.pandas()


In [None]:
def generate_embedding(
    text: str,
    task_type: str = "RETRIEVAL_DOCUMENT",
    model: str = "models/text-embedding-004",
) -> list[float]:
    """Embed a single piece of text with the module-level ``genai_client``.

    Args:
        text: The text to embed.
        task_type: Task type passed to ``EmbedContentConfig``. The default
            matches the original hard-coded value used for stored documents;
            pass a different value (e.g. ``"RETRIEVAL_QUERY"``) when embedding
            a search query.
        model: Name of the embedding model to call.

    Returns:
        The ``values`` of the first embedding in the response.

    Raises:
        Whatever ``genai_client.models.embed_content`` raises; no errors are
        handled here.
    """
    result = genai_client.models.embed_content(
        model=model,
        contents=text,
        config=types.EmbedContentConfig(task_type=task_type),
    )
    # A single `contents` value was sent, so only the first embedding is read.
    return result.embeddings[0].values

In [None]:
def embedding(text, max_retries=5):
    """Embed ``text`` with retries, falling back to a zero vector.

    Calls ``generate_embedding`` up to ``max_retries`` times. After every
    failed attempt except the last one it waits ``2 ** attempt`` seconds plus
    up to one second of random jitter before trying again.

    Args:
        text: Text to embed.
        max_retries: Maximum number of attempts.

    Returns:
        The result of ``generate_embedding(text)``, or a list of 768 zeros if
        every attempt raised (or if ``max_retries`` is not positive).
    """
    # str() keeps the log line itself from raising when `text` is not sliceable.
    preview = str(text)[:40]
    for attempt in range(max_retries):
        try:
            return generate_embedding(text)
        except Exception as err:
            print(f"Retry {attempt + 1} for: {preview}...\nError: {err}")
            # Back off only when another attempt follows; sleeping after the
            # final failure would just delay the fallback below.
            if attempt < max_retries - 1:
                time.sleep((2 ** attempt) + random.random())
    # Fallback keeps the caller running when no attempt succeeded. Say so
    # explicitly, because the zero vector is otherwise indistinguishable from
    # a real result downstream.
    # NOTE(review): 768 presumably matches the embedding model's output size — confirm.
    print(f"[WARN] No embedding for: {preview}... after {max_retries} attempts; using zero vector.")
    return [0.0] * 768

In [None]:
# Fraction of the corpus to embed, and a fixed seed so the same subset is drawn
# on every run. Without a seed, a kernel restart produces a different
# `sample_docs` than the one the on-disk embedding cache was built from.
SAMPLE_FRACTION = 0.005
SAMPLE_SEED = 42

# A dedicated Random instance makes the draw reproducible without touching the
# global `random` state.
sample_docs = random.Random(SAMPLE_SEED).sample(
    documents, int(len(documents) * SAMPLE_FRACTION)
)

# Path of the pickle file used to cache the computed embeddings.
embedding_file = "saved_embeddings.pkl"

In [None]:
# Build `formatted_knowledge`: a list of {"title", "body", "embedding"} dicts,
# loaded from the pickle cache when it exists and computed (then cached) otherwise.
if os.path.exists(embedding_file):
    # SECURITY: pickle.load can execute arbitrary code. Only load a cache file
    # that this notebook wrote itself; never one obtained from elsewhere.
    with open(embedding_file, "rb") as f:
        formatted_knowledge = pickle.load(f)
    print(f"[INFO] Loaded {len(formatted_knowledge)} items from disk.")
else:
    # Generate embeddings and save them to disk.
    formatted_knowledge = []
    for item in tqdm(sample_docs):
        # Split on the FIRST arrow only: the text after it may contain further
        # arrows, and an unbounded split would then fail to unpack into two names.
        title, content = item.split("→", 1) if "→" in item else (item, "No content")
        formatted_knowledge.append({
            "title": title.strip(),
            "body": content.strip(),
            # NOTE(review): only the title is embedded, not the body — confirm this is intended.
            "embedding": embedding(title.strip())
        })

    with open(embedding_file, "wb") as f:
        pickle.dump(formatted_knowledge, f)
    # Report only after the file has been closed, i.e. fully written.
    print(f"[INFO] Saved {len(formatted_knowledge)} embeddings to '{embedding_file}'")