# Reciped-ML: A Natural Language Processing (NLP) model for text embedding of recipes used in our recommendation system

In [None]:
%pip install pandas
%pip install numpy
%pip install nltk
%pip install gensim
%pip install scikit-learn
%pip install annoy


In [None]:
# Importing required libraries
import pandas as pd
import numpy as np
import ast
import re
import nltk
import pickle
import joblib
import statistics
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict 
from annoy import AnnoyIndex

nltk.download('wordnet')
nltk.download('punkt')

## Dataset cleanup and storage

In [None]:
def convert_str_to_list(string: str):
    """Turn a bracketed, comma-separated string into a list of trimmed strings.

    The first and last characters (the surrounding brackets in the CSV cells)
    are dropped, the remainder is split on every comma and each piece is
    stripped of surrounding whitespace. Quote characters inside the pieces are
    left as they are.
    """
    inner = string[1:-1]
    return [piece.strip() for piece in inner.split(",")]

# Read the CSV file into a DataFrame.
# The ingredients, directions and NER columns are parsed with
# convert_str_to_list while loading, so each cell becomes a list of strings.
recipe_df = pd.read_csv('recdata.csv', converters={'ingredients': convert_str_to_list, 'directions': convert_str_to_list, 'NER': convert_str_to_list})
# Order the recipes by ascending id
recipe_df = recipe_df.sort_values(by='id', ascending=True)
# Show the DataFrame as the cell output
recipe_df

In [None]:
# # Type casting values for appropriate use
# recipe_df['title'] = recipe_df['title'].astype(str)
# recipe_df['ingredients'] = recipe_df['ingredients'].astype(ast.literal_eval)
# recipe_df['directions'] = recipe_df['directions'].astype(ast.literal_eval)
# recipe_df['NER'] = recipe_df['NER'].astype(ast.literal_eval)

In [None]:
# Stop words found in NER column; clean_ingredient_name removes them when they
# appear as whole words
measurement_units = ["spoon", "cup", "ounce", "gram", "handful", "pinch", "tasty"]
# Shared lemmatizer used by clean_ingredient_name to reduce names to a base form
lemmatizer = WordNetLemmatizer()

# Cleanup function for ingredient name
# Characters stripped from every name: symbols, punctuation and digits.
_SPECIAL_CHARS_RE = re.compile(r'[_$"\',#…\\/(){}\[\]!?0-9]')
# Matches "and" only as a standalone word, so names that merely contain the
# letters (e.g. "sandwich", "brandy") are not split apart.
_AND_WORD_RE = re.compile(r'\band\b')


def _normalise_ingredient(text: str) -> str:
    """Strip symbols/digits, lowercase, lemmatize and remove measurement units."""
    text = _SPECIAL_CHARS_RE.sub('', text).strip().lower()
    text = lemmatizer.lemmatize(text)  # Base word
    # Units are removed only when they appear as whole words
    units_pattern = r'\b(?:' + '|'.join(map(re.escape, measurement_units)) + r')\b'
    # Strip again: removing a unit can leave leading/trailing whitespace behind
    return re.sub(units_pattern, '', text).strip()


def clean_ingredient_name(ingredient: str) -> "str | list[str] | None":
    """Clean one raw ingredient name from the NER column.

    Args:
        ingredient: Raw ingredient string.

    Returns:
        None when the entry contains 'http', is at most one character long,
        or is empty after cleaning; a list of cleaned names when the entry
        joins several ingredients with the word "and"; otherwise the cleaned
        name as a single string.
    """
    if 'http' in ingredient or len(ingredient) <= 1:
        return None

    cleaned = _normalise_ingredient(ingredient)
    if not cleaned:
        # Nothing left (e.g. the entry was only a unit or only symbols)
        return None

    if _AND_WORD_RE.search(cleaned):
        parts = [_normalise_ingredient(part) for part in _AND_WORD_RE.split(cleaned)]
        # Drop the empty pieces produced by a leading/trailing "and"
        parts = [part for part in parts if part]
        return parts or None

    return cleaned

# Flatten the list of ingredients in the NER column
def flatten_list(lst: list) -> list:
    """Flatten one level of nesting and drop None entries.

    Items that are lists contribute their elements; every other item is kept
    as is. None values are removed from the final result, whether they were
    top-level items or elements of a nested list.
    """
    merged = []
    for entry in lst:
        merged += entry if isinstance(entry, list) else [entry]
    return [name for name in merged if name is not None]

In [None]:
# Applying cleanup function
# Print the raw NER entry stored under index label 0 before cleaning
print(recipe_df['NER'][0])
# Clean every name of every recipe; flatten_list merges names that the cleaner
# split into several parts and drops the names it rejected (None)
recipe_df['NER'] = recipe_df['NER'].apply(lambda x: flatten_list([clean_ingredient_name(item) for item in x]))
# Drop rows with a missing NER value, then renumber the index from 0
recipe_df = recipe_df.dropna(subset=['NER'])
recipe_df = recipe_df.reset_index(drop=True)

In [None]:
# Generating unique ingredient dataset
# One row per (recipe, ingredient) pair
recipe_df_exploded = recipe_df.explode(column='NER')
# Normalise case and surrounding whitespace before de-duplicating
recipe_df_exploded['NER'] = recipe_df_exploded['NER'].str.lower().str.strip()
unique_ingredients = recipe_df_exploded['NER'].unique()
ingredients_df = pd.DataFrame({'ingredient': unique_ingredients})
# Assign sequential ids starting at 1
ingredients_df['id'] = range(1, len(ingredients_df) + 1)
# Show the DataFrame as the cell output
ingredients_df

In [None]:
# Reorder the columns so that id comes before ingredient
ingredients_df = ingredients_df[['id', 'ingredient']]
# Show the DataFrame as the cell output
ingredients_df

In [None]:
# Writing the cleaned recipe dataset to a pickle file for storage
recipe_df.to_pickle('recipes_df.pkl')

It was noticed that certain ingredients were extremely common amongst recipes and, when used in embeddings, they somewhat overpowered the more important ingredients. It was decided that these ingredients provide little distinguishing power between recipes, and that users would not always be expected to input them. Thus, the top 15 most common ingredients were completely removed from the NER column.

In [None]:
def remove_common(ingredients: list) -> list:
    """Return the ingredients that are not in most_common_ingredients.

    The order of the remaining ingredients is preserved. The module-level
    most_common_ingredients collection is looked up when the function is called.
    """
    kept = []
    for name in ingredients:
        if name in most_common_ingredients:
            continue
        kept.append(name)
    return kept

# Tallying of ingredient frequency amongst recipes
# (each NER list is fed into the frequency distribution in turn)
vocabulary = nltk.FreqDist()
for ingredients in recipe_df['NER']:
    vocabulary.update(ingredients)

# Keep only the names of the 15 most frequent ingredients; remove_common reads this list
most_common_ingredients = [word for word, freq in vocabulary.most_common(15)]
# Show the list as the cell output
most_common_ingredients

In [None]:
# Removing the top 15 most common ingredients from every recipe's NER list
recipe_df['NER'] = recipe_df["NER"].apply(remove_common)

## Model Building Process

The model makes use of a popular Neural Network (NN) called word2vec that takes in a corpus of text and maps each word in the corpus to a vector of fixed length. The model captures semantic and syntactic relationships between words with the idea that words with similar meanings tend to occur in similar contexts. For this model, the NER column represents a list of base ingredients that have been extracted using other forms of ML. These lists are considered to be the documents of the corpus, as it was decided that the amount of ingredients and preparation style did not contribute significantly to the overall theme of the recipe. The word2vec model replaces each word in the corpus with a vector that is 100 elements in length. As a result, the documents become two dimensional, which proves to be computationally expensive to compare when using techniques such as cosine similarity. For that reason, a weighted average method was adopted in the form of TFIDF embeddings, where the words are given a weight directly proportional to their term frequency in the document and inversely proportional to their frequency across documents. This embedding was used to give a weighted average of vectors in a single document, resulting in a single one dimensional vector that is 100 elements in length.

In [None]:
# Creates document embeddings from word embeddings using a weighted average
class TfidfEmbedding(object):
    """Builds one fixed-length vector per recipe from word2vec ingredient vectors.

    Every ingredient known to the word2vec model contributes its vector scaled
    by the ingredient's IDF weight; the scaled vectors of a recipe are then
    averaged into a single vector of length ``vector_size``.
    """

    def __init__(self, model_w2v: "Word2Vec"):
        self.model_w2v = model_w2v
        # Maps word -> idf weight; populated by fit()
        self.word_idf_weight = None
        self.vector_size = model_w2v.wv.vector_size

    # creates the idf dictionary for all words in the corpus
    def fit(self, recipes: list):
        """Fit a TfidfVectorizer on the recipes and store its idf weights.

        Args:
            recipes: Iterable of ingredient lists (one list per recipe).

        Returns:
            self, so calls can be chained.

        Side effects:
            Writes the fitted vectorizer to 'tfidf.pkl' with joblib.
        """
        # Each recipe becomes one space separated string
        text_docs = [" ".join(doc) for doc in recipes]
        # NOTE(review): the vectorizer tokenises the joined text with its
        # default settings, so an ingredient made of several words is
        # presumably stored as separate tokens and would receive the fallback
        # weight in doc_average -- confirm whether that is intended.
        tfidf = TfidfVectorizer()
        tfidf.fit(text_docs)
        joblib.dump(tfidf, 'tfidf.pkl')
        max_idf = max(tfidf.idf_)   # if a word was never seen it is given idf of the max of known idf value
        self.word_idf_weight = defaultdict(
            lambda: max_idf,
            [(word, tfidf.idf_[i]) for word, i in tfidf.vocabulary_.items()],
        )
        return self

    # converts a list of recipes to a list of document vectors
    def transform(self, recipes: list):
        """Return a 2-D array with one embedding row per recipe."""
        return self.doc_average_list(recipes)

    # returns the document embedding as a weighted average
    def doc_average(self, recipe: list):
        """Average the idf-scaled vectors of the recipe's known ingredients.

        Ingredients missing from the word2vec vocabulary are skipped. When no
        ingredient is known, a zero vector of length ``vector_size`` is returned.
        """
        wv = self.model_w2v.wv
        # key_to_index is a dict, so membership is O(1); index_to_key is a list
        # and would be scanned linearly for every single word.
        weighted = [
            wv.get_vector(word) * self.word_idf_weight[word]
            for word in recipe
            if word in wv.key_to_index
        ]
        if not weighted:
            return np.zeros(self.vector_size)
        return np.array(weighted).mean(axis=0)

    # returns the full list of all document embeddings
    def doc_average_list(self, recipes):
        """Stack the embeddings of all recipes into one 2-D array.

        An empty input yields an array of shape (0, vector_size) instead of
        the ValueError that np.vstack raises for an empty sequence.
        """
        rows = [self.doc_average(recipe) for recipe in recipes]
        if not rows:
            return np.empty((0, self.vector_size))
        return np.vstack(rows)

The ingredient lists are sorted alphabetically to ensure a standard order. Since word2vec considers surrounding words for context, identical ingredients in different orders are considered to have different contexts. Additionally, when generating recommendations, a simple cosine similarity check on all possible recipes is computationally expensive to repeat a number of times. This is why the ANNOY (Approximate Nearest Neighbour Oh Yeah) Python library is used. This library is designed to efficiently find the approximate nearest neighbours of a query point in high dimensional space. The annoy index is a data structure that stores randomized binary trees for efficient retrieval of nearest neighbours.

In [None]:
# Sorts the ingredient list alphabetically
def sort_corpus(recipes: pd.DataFrame) -> list:
    """Return the NER ingredient lists of all recipes, each sorted alphabetically.

    Args:
        recipes: DataFrame with an "NER" column holding one list of
            ingredient names per row.

    Returns:
        A new list of new sorted lists, in row order. The lists stored in the
        DataFrame are left untouched.
    """
    # sorted() builds a copy; list.sort() would reorder the DataFrame's own lists
    sorted_corpus = [sorted(doc) for doc in recipes["NER"]]
    if sorted_corpus:
        # Preview of the first sorted document (skipped for an empty corpus)
        print(sorted_corpus[0])
    return sorted_corpus

# Window length is taken to be the average length of ingredient lists
def get_window_length(corpus: list) -> int:
    """Return the mean document length of the corpus, rounded with round().

    Raises:
        ZeroDivisionError: if the corpus contains no documents.
    """
    total_items = 0
    doc_count = 0
    for doc in corpus:
        total_items += len(doc)
        doc_count += 1
    return round(total_items / doc_count)

# Creates and saves the word2vec model
def create_and_save_w2v_model(recipes: pd.DataFrame) -> Word2Vec:
    """Train a word2vec model on the sorted ingredient lists and save it.

    Args:
        recipes: DataFrame with an "NER" column of ingredient lists.

    Returns:
        The trained model, with its word vectors scaled to unit length.

    Side effects:
        Creates the 'models' directory if needed and writes
        'models/model_w2v.bin'.
    """
    import os  # local import: only needed to make sure the output folder exists

    corpus = sort_corpus(recipes)
    model_w2v = Word2Vec(corpus, sg=0, workers=8, window=get_window_length(corpus), min_count=1, vector_size=100)
    # init_sims(replace=True) is deprecated in gensim 4; unit_normalize_all()
    # is the supported call that performs the same in-place normalisation.
    model_w2v.wv.unit_normalize_all()
    # save() does not create missing parent directories
    os.makedirs('models', exist_ok=True)
    model_w2v.save('models/model_w2v.bin')
    return model_w2v

# Generates the full corpus embeddings for all recipes
def get_corpus_embeddings(recipes: pd.DataFrame, tfidf_vectorizer: TfidfEmbedding) -> tuple:
    """Fit the embedder on the recipes and embed every recipe.

    Args:
        recipes: DataFrame with an "NER" column of ingredient lists.
        tfidf_vectorizer: Embedder to fit; it is fitted in place.

    Returns:
        A ``(recipe_embeddings, fitted_tfidf)`` tuple. ``recipe_embeddings``
        is a list with one array of shape (1, vector_size) per recipe, in row
        order; ``fitted_tfidf`` is whatever ``tfidf_vectorizer.fit`` returned.
    """
    corpus = sort_corpus(recipes)
    fitted_tfidf = tfidf_vectorizer.fit(corpus)
    embedded = tfidf_vectorizer.transform(corpus)
    # Each row is reshaped to (1, vector_size) so callers can index embedding[0]
    recipe_embeddings = [doc.reshape(1, -1) for doc in embedded]
    # Internal sanity check: exactly one embedding per document
    assert len(recipe_embeddings) == len(corpus)
    return recipe_embeddings, fitted_tfidf

# Returns a list of recipe_ids relating to the top N recommendations
def get_recommendations(input: list, fitted_tfidf: "TfidfEmbedding", annoy_index: "AnnoyIndex", corpus_embeddings: list, N=5, chef=False) -> list:
    """Return the index ids of the N nearest recipes to the query.

    Args:
        input: A list of ingredient names when ``chef`` is true, otherwise the
            position of a recipe in ``corpus_embeddings``.
        fitted_tfidf: Fitted embedder; only used when ``chef`` is true.
        annoy_index: Built Annoy index to query.
        corpus_embeddings: Embeddings of all recipes; only used when ``chef``
            is false. Each entry may be flat or of shape (1, vector_size).
        N: Number of neighbours to return.
        chef: Selects how ``input`` is interpreted (see above).

    Returns:
        The list returned by ``annoy_index.get_nns_by_vector`` for N neighbours.
    """
    if chef:
        # sorted() keeps the caller's ingredient list untouched
        ingredients = sorted(input)
        input_embedding = fitted_tfidf.transform([ingredients])[0]
    else:
        input_embedding = corpus_embeddings[input]

    # Flatten so both flat and (1, vector_size) embeddings are accepted
    input_embedding = np.asarray(input_embedding).reshape(-1)
    # Honour N instead of always asking for 5 neighbours
    return annoy_index.get_nns_by_vector(input_embedding, N)

# Creates the corpus embeddings as well as builds and saves an annoy index
def create_and_save_embeddings(model_w2v : Word2Vec, recipes : pd.DataFrame) -> tuple:
    """Embed every recipe, index the embeddings with Annoy and save the index.

    Args:
        model_w2v: Trained word2vec model supplying the ingredient vectors.
        recipes: DataFrame with an "NER" column of ingredient lists.

    Returns:
        A ``(corpus_embeddings, fitted_tfidf, annoy_index)`` tuple.

    Side effects:
        Writes the built index to 'annoy_index.ann'.
    """
    tfidf_vectorizer = TfidfEmbedding(model_w2v)
    corpus_embeddings, fitted_tfidf = get_corpus_embeddings(recipes, tfidf_vectorizer)

    # The index dimension follows the model rather than a hard-coded 100, so a
    # model trained with another vector_size still produces a valid index
    annoy_index = AnnoyIndex(model_w2v.wv.vector_size, 'angular')
    for i, embedding in enumerate(corpus_embeddings): # Add embeddings to the index
        # embedding has shape (1, vector_size); Annoy takes the flat row
        annoy_index.add_item(i, embedding[0])

    annoy_index.build(n_trees=500) # Build the index to allow for effective search
    annoy_index.save('annoy_index.ann')
    return corpus_embeddings, fitted_tfidf, annoy_index


## Model Usage

In [None]:
# Train and save the word2vec model, then build and save the recipe embeddings
# and the Annoy index
model_w2v = create_and_save_w2v_model(recipe_df)
corpus_embeddings, fitted_tfidf, annoy_index = create_and_save_embeddings(model_w2v, recipe_df)


In [None]:
# Unwrap each (1, vector_size) embedding into a plain list of floats.
# NOTE(review): this rebinds corpus_embeddings, so the cell is presumably not
# safe to run twice (embedding[0] would then be a single number) -- confirm.
corpus_embeddings = [list(embedding[0]) for embedding in corpus_embeddings]

In [None]:
import csv

# Write one embedding per row to embed.csv
with open('embed.csv', "w", newline="") as embed_file:
    csv.writer(embed_file).writerows(corpus_embeddings)

In [None]:
# Recommend recipes similar to an existing recipe, identified by its row index
input_id = 100
test_recipe = recipe_df.loc[input_id, ['title', 'NER']]
print(f"Test recipe is : {test_recipe}")
recommended_ids = get_recommendations(input=input_id, fitted_tfidf=fitted_tfidf, annoy_index=annoy_index, corpus_embeddings=corpus_embeddings, chef=False)

# Names below avoid rebinding the builtins `id` and `input` for the rest of
# the notebook session
for recommended_id in recommended_ids:
    print(f"Recommended Recipe is: {recipe_df.loc[recommended_id, ['title', 'NER']]}")

# Recommend recipes for a free-form list of ingredients
input_ingredients = ["strawberry", "mango"]
print(f"Test ingredient list is {input_ingredients}")
recommended_ids = get_recommendations(input=input_ingredients, fitted_tfidf=fitted_tfidf, annoy_index=annoy_index, corpus_embeddings=corpus_embeddings, chef=True)

for recommended_id in recommended_ids:
    print(f"Recommended Recipe is: {recipe_df.loc[recommended_id, ['title', 'NER']]}")
