In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_and_sample_dataset(input_csv, sample_ratio=0.01):
    # Load dataset
    df = pd.read_csv(input_csv)
    # Sample 1% of the dataset randomly
    sample_df = df.sample(frac=sample_ratio)
    # Save the sampled dataset to a new CSV
    sample_csv = "sample_set.csv"
    sample_df.to_csv(sample_csv, index=False)
    return sample_csv

In [5]:
input_csv = "RecipeNLG_dataset.csv"  # Change this to your dataset's filename
sample_ratio = 0.01  # 1% of the data
sample_csv = load_and_sample_dataset(input_csv, sample_ratio)

In [15]:
def embed_data_with_sentence_transformers(csv_file, model_name, file_column):
    # Load the sample dataset
    df = pd.read_csv(csv_file)
    # Assuming the text to be embedded is in the 'text' column
    texts = df[file_column].tolist()
    # Load the model
    model = SentenceTransformer(model_name)
    #Put the model on MPS
    model = model.to(torch.device("mps"))
    # Generate embeddings
    embeddings = model.encode(texts, show_progress_bar=True)
    return embeddings


def save_embeddings_to_csv(embeddings, output_file):
    # Convert the embeddings into a DataFrame
    embeddings_df = pd.DataFrame(embeddings)
    # Save the DataFrame to a CSV file
    embeddings_df.to_csv(output_file, index=False)
    print(f"Embeddings saved to {output_file}")


In [16]:
ingredients_embedings = embed_data_with_sentence_transformers(csv_file = "sample_set.csv", model_name='all-MiniLM-L6-v2', file_column ='ingredients')
save_embeddings_to_csv(embeddings = ingredients_embedings, output_file = "IngredientsEmbeddings.csv")

Batches: 100%|██████████| 698/698 [01:15<00:00,  9.22it/s]


Embeddings saved to IngredientsEmbeddings.csv


In [17]:
directions_embedings = embed_data_with_sentence_transformers(csv_file = "sample_set.csv", model_name='all-MiniLM-L6-v2', file_column ='directions')
save_embeddings_to_csv(embeddings = directions_embedings, output_file = "DirectionsEmbeddings.csv")

Batches: 100%|██████████| 698/698 [01:09<00:00, 10.04it/s]


Embeddings saved to DirectionsEmbeddings.csv


In [18]:
def create_faiss_index(embeddings, column_name):
    # Dimension of the vectors
    d = embeddings.shape[1]
    # Creating a FAISS index
    # Here we use IndexFlatL2 which is good for general purposes. 
    # You might want to explore other indices depending on your specific needs.
    index = faiss.IndexFlatL2(d)
    # FAISS expects vectors in float32
    embeddings = embeddings.astype('float32')
    # Adding vectors to the index
    index.add(embeddings)
    # Save the index to a file
    index_file_path = str(column_name) + ".faiss"
    faiss.write_index(index, index_file_path)

In [19]:
create_faiss_index(ingredients_embedings, 'ingredients')

In [20]:
create_faiss_index(directions_embedings, 'directions')

In [27]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

class EmbeddingSearcher:
    def __init__(self, model_name, index_file, sample_df):
        # Load the Sentence Transformer model
        self.model = SentenceTransformer(model_name)
        # Load the FAISS index
        self.index = faiss.read_index(index_file)
        # Store the sample DataFrame
        self.sample_df = sample_df
    
    def embed_and_search(self, text, top_n=5):
        # Embed the text
        query_embedding = self.model.encode([text], convert_to_tensor=True, show_progress_bar=True).cpu().numpy()
        
        # Search the index
        distances, indices = self.index.search(query_embedding, top_n)
        
        # Print the full rows for the indices found
        for idx in indices[0]:
            # Use iloc to access the DataFrame row and print it
            print(self.sample_df.iloc[idx])

        return indices[0], distances[0]






In [28]:
# Assuming 'sample_set.csv' is the CSV file containing your sample set
sample_df = pd.read_csv('sample_set.csv')
# Example usage:
# Initialize the searcher with the model, index, and sample DataFrame
searcher = EmbeddingSearcher(model_name='all-MiniLM-L6-v2', index_file='ingredients.faiss', sample_df=sample_df)

# The text you want to search for similar items
query_text = "Give me a recipe I can make with potatos, cumin, chicken thigh, bell peppers, and red wine."

# Perform the embedding and search, which now also prints the relevant rows
indices, distances = searcher.embed_and_search(query_text, top_n=5)


print("Indices of top N most similar items:", indices)
print("Corresponding distances:", distances)

Batches: 100%|██████████| 1/1 [00:00<00:00, 35.51it/s]

Unnamed: 0                                                 83018
title                                   Red Potato And Egg Salad
ingredients    ["6 red potatoes, boiled and cut in half", "3 ...
directions     ["Combine the potatoes, eggs, scallions and ca...
link             www.cookbooks.com/Recipe-Details.aspx?id=491476
source                                                  Gathered
NER            ["red potatoes", "eggs", "scallion", "carrot",...
Name: 15358, dtype: object
Unnamed: 0                                               2061185
title                                Parmesan Au Gratin Potatoes
ingredients    ["5 large un peeled red potatoes, cooked in be...
directions     ["Remove cooled potatos from broth, peel and c...
link           cookpad.com/us/recipes/340549-parmesan-au-grat...
source                                                 Recipes1M
NER            ["red potatoes", "flour", "butter", "chicken s...
Name: 11171, dtype: object
Unnamed: 0                          




In [26]:
# Assuming 'sample_set.csv' is the CSV file containing your sample set
sample_df = pd.read_csv('sample_set.csv')
# Example usage:
# Initialize the searcher with the model and index you have prepared
searcher = EmbeddingSearcher(model_name='all-MiniLM-L6-v2', index_file='ingredients.faiss')

# The text you want to search for similar items
query_text = "Give me a recipe I can make with potatos, cumin, chicken thigh, bell peppers, and red wine."

# Perform the embedding and search
indices, distances = searcher.embed_and_search(query_text, top_n=5)

print("Indices of top N most similar items:", indices)
print("Corresponding distances:", distances)

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.61it/s]

Indices of top N most similar items: [15358 11171  4930  3161 18025]
Corresponding distances: [0.5327559  0.54329157 0.5460544  0.5489533  0.5581025 ]





In [1]:

def load_and_sample_dataset(input_csv, sample_ratio=0.01):
    # Load dataset
    df = pd.read_csv(input_csv)
    # Sample 1% of the dataset randomly
    sample_df = df.sample(frac=sample_ratio)
    # Save the sampled dataset to a new CSV
    sample_csv = "sample_set.csv"
    sample_df.to_csv(sample_csv, index=False)
    return sample_csv

def embed_data_with_sentence_transformers(csv_file, model_name='all-MiniLM-L6-v2'):
    # Load the sample dataset
    df = pd.read_csv(csv_file)
    # Assuming the text to be embedded is in the 'text' column
    texts = df['text'].tolist()
    # Load the model
    model = SentenceTransformer(model_name)
    # Generate embeddings
    embeddings = model.encode(texts, show_progress_bar=True)
    return embeddings

def create_faiss_index(embeddings):
    # Dimension of the vectors
    d = embeddings.shape[1]
    # Creating a FAISS index
    # Here we use IndexFlatL2 which is good for general purposes. 
    # You might want to explore other indices depending on your specific needs.
    index = faiss.IndexFlatL2(d)
    # FAISS expects vectors in float32
    embeddings = embeddings.astype('float32')
    # Adding vectors to the index
    index.add(embeddings)
    # Save the index
    faiss.write_index(index, "faiss_index.idx")


[34mRecipeApp[m[m/     [34mRecipeAppVenv[m[m/


In [None]:
def main():
    input_csv = "your_dataset.csv"  # Change this to your dataset's filename
    sample_ratio = 0.01  # 1% of the data
    sample_csv = load_and_sample_dataset(input_csv, sample_ratio)
    embeddings = embed_data_with_sentence_transformers(sample_csv)
    create_faiss_index(embeddings)
    print("Sample set created, embeddings generated, and FAISS index created.")

if __name__ == "__main__":
    main()
