In [61]:
"""
This notebook contains:
- Lemmatization of ingredients
- Generation of a list of valid ingredients
- Finding appropriate keywords for filtering (e.g. Vegan)
"""

'\nThis notebook contains:\n- Lemmatization of ingredients\n- Generation of a list of valid ingredients\n- Finding appropriate keywords for filtering (e.g. Vegan)\n'

In [62]:
# import packages
from google.colab import drive
from google.colab import files
import pandas as pd
import numpy as np
from ast import literal_eval
import json
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [63]:
drive.mount('/content/gdrive')
df = pd.read_csv('gdrive/My Drive/McHacks2022/2022-McHacks-Team/recipe_10000.csv', converters={"RecipeIngredientParts": literal_eval})

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [64]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,index,RecipeId,Name,CookTime,PrepTime,TotalTime,Description,Images,RecipeCategory,Keywords,RecipeIngredientQuantities,RecipeIngredientParts,AggregatedRating,Calories,RecipeServings,RecipeYield,RecipeInstructions,reviews_in_dict
0,0,0,38,Low-Fat Berry Blue Frozen Dessert,PT24H,PT45M,PT24H45M,Make and share this Low-Fat Berry Blue Frozen ...,['https://img.sndimg.com/food/image/upload/w_5...,Frozen Desserts,"['Dessert', 'Low Protein', 'Low Cholesterol', ...","['4', '1/4', '1', '1']","[blueberries, granulated sugar, vanilla yogurt...",4.5,170.9,4.0,,"['Toss 2 cups berries with sugar.', 'Let stand...","{579149: {'Rating': 5, 'Review': 'Yummy, yummy..."
1,1,4,39,Biryani,PT25M,PT4H,PT4H25M,Make and share this Biryani recipe from Food.com.,['https://img.sndimg.com/food/image/upload/w_5...,Chicken Breast,"['Chicken Thigh & Leg', 'Chicken', 'Poultry', ...","['1', '4', '2', '2', '8', '1/4', '8', '1/2', '...","[saffron, milk, hot green chili peppers, onion...",3.0,1110.7,6.0,,['Soak saffron in warm milk for 5 minutes and ...,"{361851: {'Rating': 3, 'Review': 'I have an In..."
2,2,5,40,Best Lemonade,PT5M,PT30M,PT35M,This is from one of my first Good House Keepi...,['https://img.sndimg.com/food/image/upload/w_5...,Beverages,"['Low Protein', 'Low Cholesterol', 'Healthy', ...","['1 1/2', '1', '1 1/2', '3/4']","[sugar, lemons, rind of, lemon, zest of, fresh...",4.5,311.1,4.0,,"['Into a 1 quart Jar with tight fitting lid, p...","{34854: {'Rating': 5, 'Review': ""My favourite ..."
3,3,14,41,Carina's Tofu-Vegetable Kebabs,PT20M,PT24H,PT24H20M,This dish is best prepared a day in advance to...,['https://img.sndimg.com/food/image/upload/w_5...,Soy/Tofu,"['Beans', 'Vegetable', 'Low Cholesterol', 'Wee...","['12', '1', '2', '1', '10', '1', '3', '2', '2'...","[extra firm tofu, eggplant, zucchini, mushroom...",4.5,536.1,2.0,4 kebabs,"['Drain the tofu, carefully squeezing out exce...","{48870: {'Rating': 5, 'Review': 'I thought thi..."
4,4,16,42,Cabbage Soup,PT30M,PT20M,PT50M,Make and share this Cabbage Soup recipe from F...,['https://img.sndimg.com/food/image/upload/w_5...,Vegetable,"['Low Protein', 'Vegan', 'Low Cholesterol', 'H...","['46', '4', '1', '2', '1']","[plain tomato juice, cabbage, onion, carrots, ...",4.5,103.6,4.0,,['Mix everything together and bring to a boil....,"{46368: {'Rating': 5, 'Review': 'This is a ver..."


In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  10000 non-null  int64  
 1   index                       10000 non-null  int64  
 2   RecipeId                    10000 non-null  int64  
 3   Name                        10000 non-null  object 
 4   CookTime                    10000 non-null  object 
 5   PrepTime                    10000 non-null  object 
 6   TotalTime                   10000 non-null  object 
 7   Description                 10000 non-null  object 
 8   Images                      10000 non-null  object 
 9   RecipeCategory              10000 non-null  object 
 10  Keywords                    10000 non-null  object 
 11  RecipeIngredientQuantities  10000 non-null  object 
 12  RecipeIngredientParts       10000 non-null  object 
 13  AggregatedRating            1000

In [66]:
# If nltk is not yet installed
# !pip install nltk

In [67]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text]

In [68]:
# Before lemmatizing
df["RecipeIngredientParts"]

0       [blueberries, granulated sugar, vanilla yogurt...
1       [saffron, milk, hot green chili peppers, onion...
2       [sugar, lemons, rind of, lemon, zest of, fresh...
3       [extra firm tofu, eggplant, zucchini, mushroom...
4       [plain tomato juice, cabbage, onion, carrots, ...
                              ...                        
9995    [maraschino cherries, butter, margarine, corn ...
9996    [onion, parsley, celery & leaves, chickpeas, b...
9997    [brown sugar, olive oil, salt, black pepper, s...
9998    [eggs, cheddar cheese, salt, cracked black pep...
9999    [butter, onion, celery, garlic powder, potatoe...
Name: RecipeIngredientParts, Length: 10000, dtype: object

In [69]:
# After lemmatizing
df["RecipeIngredientParts"].apply(lemmatize_text)

0       [blueberry, granulated sugar, vanilla yogurt, ...
1       [saffron, milk, hot green chili peppers, onion...
2       [sugar, lemons, rind of, lemon, zest of, fresh...
3       [extra firm tofu, eggplant, zucchini, mushroom...
4       [plain tomato juice, cabbage, onion, carrot, c...
                              ...                        
9995    [maraschino cherries, butter, margarine, corn ...
9996    [onion, parsley, celery & leaves, chickpea, bl...
9997    [brown sugar, olive oil, salt, black pepper, s...
9998    [egg, cheddar cheese, salt, cracked black pepp...
9999    [butter, onion, celery, garlic powder, potato,...
Name: RecipeIngredientParts, Length: 10000, dtype: object

In [70]:
df["RecipeIngredientParts"]

0       [blueberries, granulated sugar, vanilla yogurt...
1       [saffron, milk, hot green chili peppers, onion...
2       [sugar, lemons, rind of, lemon, zest of, fresh...
3       [extra firm tofu, eggplant, zucchini, mushroom...
4       [plain tomato juice, cabbage, onion, carrots, ...
                              ...                        
9995    [maraschino cherries, butter, margarine, corn ...
9996    [onion, parsley, celery & leaves, chickpeas, b...
9997    [brown sugar, olive oil, salt, black pepper, s...
9998    [eggs, cheddar cheese, salt, cracked black pep...
9999    [butter, onion, celery, garlic powder, potatoe...
Name: RecipeIngredientParts, Length: 10000, dtype: object

In [71]:
# Create a list of all valid ingredients
all_valid_ingredients = []
for index, row in df.iterrows():
  recipe_ingredients = row["RecipeIngredientParts"]
  # recipe_ingredients = literal_eval(row["RecipeIngredientParts"])
  if recipe_ingredients != []:
    for ingredient in recipe_ingredients:
      if not ingredient in all_valid_ingredients:
        all_valid_ingredients.append(ingredient)
print(all_valid_ingredients)
print("Number of ingredients before accounting for capitalization:", len(all_valid_ingredients))
print("Number of unique ingredients before accounting for capitalization:", len(set(all_valid_ingredients)))  # Check if all elements are unique

# Removing duplicates due to capitalization
all_valid_ingredients = [x.lower() for x in all_valid_ingredients]
print("Number of unique ingredients after accounting for capitalization:", len(set(all_valid_ingredients)))
all_valid_ingredients = set(all_valid_ingredients)
all_valid_ingredients = list(all_valid_ingredients)
# print(all_valid_ingredients)

['blueberries', 'granulated sugar', 'vanilla yogurt', 'lemon juice', 'saffron', 'milk', 'hot green chili peppers', 'onions', 'garlic', 'clove', 'peppercorns', 'cardamom seed', 'cumin seed', 'poppy seed', 'mace', 'cilantro', 'mint leaf', 'fresh lemon juice', 'plain yogurt', 'boneless chicken', 'salt', 'ghee', 'onion', 'tomatoes', 'basmati rice', 'long-grain rice', 'raisins', 'cashews', 'eggs', 'sugar', 'lemons, rind of', 'lemon, zest of', 'fresh water', 'extra firm tofu', 'eggplant', 'zucchini', 'mushrooms', 'soy sauce', 'low sodium soy sauce', 'olive oil', 'maple syrup', 'honey', 'red wine vinegar', 'garlic cloves', 'mustard powder', 'black pepper', 'plain tomato juice', 'cabbage', 'carrots', 'celery', 'chicken', 'butter', 'flour', 'button mushrooms', 'green pepper', 'canned pimiento', 'Worcestershire sauce', 'parsley', 'margarine', 'egg', 'buttermilk', 'graham cracker crumbs', 'brown sugar', 'vanilla extract', 'pecan halves', 'fresh mushrooms', 'boneless skinless chicken breast halves

In [72]:
# Output in json format
jsonStr = json.dumps(all_valid_ingredients)
print(jsonStr)

with open('all_valid_ingredients_10000.json', 'w') as f:
    json.dump(jsonStr, f)

files.download('all_valid_ingredients_10000.json')

["frozen green beans", "long grain rice", "top round steak", "light ricotta cheese", "blue cheese", "unsalted peanuts", "green lentil", "reduced-sodium chicken broth", "small potato", "mexican-style stewed tomatoes with jalapeno peppers", "rice", "haddock fillet", "marshmallow cream", "smoked sausage", "fat free sour cream", "cod fish fillets", "red seedless grapes", "seedless watermelon", "green mango", "non-alcoholic beer", "chicken breasts", "low-fat buttermilk", "non-fat powdered milk", "creamed honey", "roast beef", "french baguette", "eggplants", "shallot", "madras curry powder", "ginger cube", "lamb breast", "red chili sauce  (to be used with traditional tamales)", "sage", "cold water", "white wine", "wild rice", "instant vanilla flavor pudding and pie filling", "lox", "dried whole dill weed", "lemon pepper seasoning", "palm vinegar", "lime rind", "lettuce leaf", "focaccia bread", "thai fish sauce", "ground mustard", "beef eye round", "sweet bell peppers", "navel oranges", "pean

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#### How to improve the search (beyond exact match)

In [73]:
# Filter - Vegan: Searching what word would work to filter out Vegan and or Vegetarian options 
count = 0
for index, row in df.iterrows():
  keywords = literal_eval(row["Keywords"])
  combined = '\t'.join(keywords)
  if "Vegan" in combined: # "Veg" won't work as there are ones like ["Vegetable", "Meat"]
    print(keywords)
    count += 1
print(count)

['Low Protein', 'Vegan', 'Low Cholesterol', 'Healthy', 'Winter', '< 60 Mins', 'Easy']
['Fruit', 'Nuts', 'Lactose Free', 'Vegan', 'Low Cholesterol', 'Egg Free', 'Free Of...', 'Weeknight', 'Refrigerator', 'Easy']
['Potato', 'Vegetable', 'Low Protein', 'Vegan', 'Low Cholesterol', 'Healthy', 'Spicy', '< 60 Mins', 'Oven']
['Breads', 'Breakfast', 'Lactose Free', 'Vegan', 'Low Cholesterol', 'Egg Free', 'Healthy', 'Kosher', 'Free Of...', 'Weeknight', 'Oven', '< 4 Hours']
['Peppers', 'Vegetable', 'Tex Mex', 'Southwestern U.S.', 'Lactose Free', 'Vegan', 'Egg Free', 'Kosher', 'Free Of...', '< 60 Mins', 'Canning', 'Stove Top']
['European', 'Vegan', 'Low Cholesterol', 'Healthy', '< 60 Mins', 'Easy']
['Mexican', 'Vegan', 'Spring', 'Summer', 'Weeknight', 'No Cook', 'Easy']
['Moroccan', 'African', 'Vegan', '< 30 Mins']
['Rice', 'Vegetable', 'Vegan', 'Microwave', '< 30 Mins']
['Vegetable', 'European', 'Vegan', 'Winter', 'Weeknight', 'Brunch', '< 60 Mins']
['Asian', 'Low Protein', 'Vegan', 'Low Choleste