# Necessary imports for building Index

In [1]:
# Import cell: brings in everything the later cells use.
# The banner below tells the reader to restart the kernel if this cell is
# interrupted midway.
# NOTE(review): import order is intentionally left untouched — confirm that
# faiss and sentence_transformers do not depend on load order in your
# environment before regrouping them stdlib-first.
print("getting necessary imports, if interrupted midway restart kernel")
import json  # parses each JSONL record
import numpy as np  # float32 conversion of the embedding matrix
import faiss  # vector normalization, index construction and persistence
from sentence_transformers import SentenceTransformer  # embedding model wrapper
import os  # existence check on the data file
print("Finished imports")


getting necessary imports, if interrupted midway restart kernel
Finished imports


In [2]:
# 1. Load the Data
# The corpus is stored in JSON Lines format: one JSON object per line.
# The path is defined once so the log message, the existence check and the
# open() call can never drift apart.
DATA_PATH = 'recipes_for_embeddings.jsonl'

print(f"Loading data from '{DATA_PATH}'...")
# Raise explicitly rather than `assert`: assertions are stripped when Python
# runs with -O, which would silently skip this check.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Data file not found: '{DATA_PATH}'")

data = []
# Decode as UTF-8 explicitly so the result does not depend on the platform's
# default text encoding.
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file);
            # json.loads would otherwise fail on them.
            continue
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError as exc:
            # Report the position within the JSONL file, not just within the line.
            raise ValueError(
                f"Invalid JSON on line {line_number} of '{DATA_PATH}': {exc}"
            ) from exc

# Fail with a clear message instead of an IndexError on data[0] below.
if not data:
    raise ValueError(f"No records found in '{DATA_PATH}'")
print(f"Example document: {data[0]}")

# Separating text for embedding and metadata (to retrieve later):
# `documents` holds only the text to embed; `data` keeps the full records.
documents = [d['text_for_embedding'] for d in data]

  




Loading data from 'recipes_for_embeddings.jsonl'...
Example document: {'title': '"Adult" Pimiento Cheese ', 'text_for_embedding': 'Ingredients: a - jar diced pimientos, coarsely grated sharp cheddar, crackers, crudits, or large garlic cloves, to mayonnaise, toasted baguette. Recipe: "Adult" Pimiento Cheese'}


In [3]:
# 2. Initialize Embedding Model
# Vectorization is delegated to a dedicated embedding model rather than to
# Qwen itself.
print("Initalizing embedding model, using SentenceTransformer...")
embed_model = SentenceTransformer('BAAI/bge-m3')  # open embedding model

# 3. Generate Embeddings
# Encode every document and cast the result to a float32 NumPy array, the
# form in which the vectors are handed to FAISS in a later cell.
print("Generating embeddings...")
recipe_embeddings = np.array(
    embed_model.encode(
        documents,
        convert_to_tensor=False,
        show_progress_bar=True,
    )
).astype(np.float32)
print(f"Generated embeddings shape: {recipe_embeddings.shape}")


Initalizing embedding model, using SentenceTransformer...
Generating embeddings...


Batches:   0%|          | 0/570 [00:00<?, ?it/s]

Generated embeddings shape: (18222, 1024)


## Normalize the vectors and create FAISS index

In [5]:
# Scale the embedding rows to unit Euclidean (L2) length. The call's return
# value is not used; the same array is indexed below.
print("Normalizing vectors")
faiss.normalize_L2(recipe_embeddings)

# 4. Create FAISS Index
print("Creating Faiss index...")

# The index width is the number of columns in the embedding matrix.
dimension = recipe_embeddings.shape[1]

# Inner-product (IP) index instead of an L2 one: on unit-length vectors the
# dot product is exactly the cosine similarity.
index = faiss.IndexFlatIP(dimension)
index.add(recipe_embeddings)

print(f"Indexed {index.ntotal} documents with Cosine Similarity.")



Normalizing vectors
Creating Faiss index...
Indexed 18222 documents with Cosine Similarity.


In [6]:
# 5. Save the Index
# Write the index to disk under a single, named path so the write call and
# the confirmation message always agree.
index_path = "recipe_index.faiss"
faiss.write_index(index, index_path)
print(f"Saved Faiss index to '{index_path}'")

Saved Faiss index to 'recipe_index.faiss'
