# TF-IDF

## Packages

In [1]:
import re

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Constants

In [2]:
DATASET_PATH = "../data/raw/dataset.csv"

## Dataset

### Load data

https://huggingface.co/datasets/recipe_nlg

Columns

* id (int): ID.
* title (str): Title of the recipe.
* ingredients (list of str): Ingredients.
* directions (list of str): Instruction steps.
* ner (list of str): NER food entities.

In [3]:
stripper = lambda x: x.strip("[]").replace("'","").replace('"','').split(", ")
dataset = pd.read_csv(DATASET_PATH, index_col=0, converters={"directions":stripper, "ingredients":stripper})

### Filtering

Filter out the recipes having many steps or ingredients.

In [4]:
dataset = dataset[dataset["directions"].apply(len) <= 15]
dataset = dataset[dataset["ingredients"].apply(len) <= 20]

dataset.reset_index(inplace=True, drop=True)

Filter out the recipes that contain rare ingredients. A rare ingredient is an ingredient that has been used less than 5 times among all recipes

In [5]:
vcs = dataset["ingredients"].explode().value_counts()
selected_ingredients = set(vcs[vcs >= 5].index)

In [6]:
dataset = dataset[dataset["ingredients"].explode().isin(selected_ingredients).groupby(level=0).all()]

### Preprocessing

In [7]:
# Merge multiple word ingredients
dataset["ingredients"] = dataset["ingredients"].apply(lambda ingredients: [x.replace(" ", "_").lower() for x in ingredients])
# Only words in recipe
dataset["ingredients"] = dataset["ingredients"].apply(lambda ingredients: [x for x in ingredients if re.search("[a-z]", x)])
# Drop duplicates in recipe
dataset["ingredients"] = dataset["ingredients"].apply(lambda ingredients: list(dict.fromkeys(ingredients)))

In [8]:
dataset["ingredients"]

1          [beef, chicken_breasts, cream_of_mushroom_soup...
2          [frozen_corn, cream_cheese, butter, garlic_pow...
3          [chicken, chicken_gravy, cream_of_mushroom_sou...
4          [peanut_butter, graham_cracker_crumbs, butter,...
5          [sugar, butter, egg, buttermilk, flour, salt, ...
                                 ...                        
1729357    [lean_ground_beef, green_peppers, garlic, spag...
1729358    [salmon, cheese, flour_tortilla, green_chillie...
1729359             [bread, italian_sauce, frozen_meatballs]
1729360    [eggs, paprika, salt, choice, miracle_whip, re...
1729361    [radish, sesame_oil, white_sesame_seeds, salt,...
Name: ingredients, Length: 1565119, dtype: object

### Save the processed dataset

In [9]:
processedDatasetPath = "../data/processed/dataset.csv"

In [10]:
dataset.to_csv(processedDatasetPath, index=False)

## TF-IDF Vectorizer

The sklearn vectorizer is expecting a list of strings, so we will join the ingredient list elements with a space.

In [None]:
tfidf = TfidfVectorizer(lowercase=False)

tfidf.fit(dataset["ingredients"].str.join(" "))

In [None]:
vectorizedPath = "../models/tfidf_vectorizer.pkl"
# TODO save vectorizer

In [None]:
tfidf_recipes = tfidf.transform(dataset['ingredients'].str.join(" ")).toarray()

In [None]:
tfidf_recipes

In [None]:
tfidfRecipesPath = "../data/processed/tfidf_recipes.csv"
# TODO save tfidf_recipes

### Inference 

In [None]:
idx = 1
print(dataset.iloc[idx])

inputSet = dataset["ingredients"].iloc[idx][1: ]
print(inputSet)

In [None]:
nBestMatches = 3

transformed = tfidf.transform([" ".join(inputSet)]).toarray()
similarities = similarity_f(transformed, tfidf_recipes).flatten()

sorted_idx = np.argsort(similarities)[::-1].tolist()[:nBestMatches]
bestMatches = [dataset.iloc[i] for i in sorted_idx]

In [None]:
bestMatches