In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests

## Scrape the recipe website:
Using BeautifulSoup, we can easily parse multiple HTML pages without needing to create additional files or classes. The function below requests the HTML of the AllRecipes.com cuisine index page and finds all cuisine names and their URLs. Then, it loops through each individual cuisine page, and through each recipe linked from that page, to acquire the recipes (name, cuisine, URL, and ingredients).

In [3]:
def _fetch_soup(url, timeout):
  """
  Download one page and parse it with BeautifulSoup.

  Returns the parsed document, or None when the request fails (network
  error, timeout, or HTTP error status) so that one bad page does not abort
  the whole scrape.
  """
  try:
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
  except requests.RequestException as exc:
    print(f"Skipping {url}: {exc}")
    return None
  return BeautifulSoup(response.text, 'html.parser')


def _extract_ingredients(recipe_doc):
  """
  Collect the ingredient lines ("quantity unit name") from a parsed recipe page.

  Returns a list of strings; the list is empty when the page has no
  ingredients container.
  """
  ingredients = []
  container = recipe_doc.find('div', {'class': 'mntl-lrs-ingredients'})
  if container is None:
    return ingredients

  # Search the whole container rather than only its first <ul>, so that
  # ingredients spread over several lists inside the container are not lost.
  items = container.find_all('li', {'class': 'mntl-structured-ingredients__list-item'})
  for item in items:
    name_tag = item.find('span', {'data-ingredient-name': 'true'})
    if name_tag is None:
      continue
    quantity_tag = item.find('span', {'data-ingredient-quantity': 'true'})
    unit_tag = item.find('span', {'data-ingredient-unit': 'true'})

    # Quantity and unit are optional: keep the ingredient even when one of
    # them is absent or empty, and avoid stray spaces in the joined text.
    parts = [tag.text.strip() for tag in (quantity_tag, unit_tag, name_tag) if tag is not None]
    ingredient_text = ' '.join(part for part in parts if part)
    if ingredient_text:
      ingredients.append(ingredient_text)
  return ingredients


def scrape_allRecipes_cuisines(timeout=30):
  """
  Scrapes recipe data from Allrecipes.com based on different cuisines.

  The cuisine index page is fetched first to collect every cuisine name and
  URL. Each cuisine page is then fetched to find its recipe cards, and each
  recipe page is fetched to read its ingredient list.

  Parameters:
  timeout (float): seconds to wait for each HTTP request before giving up.

  Returns:
  pandas.DataFrame: DataFrame containing the scraped recipe information including Name, URL, Cuisine, and Ingredients
  (Ingredients is a list of strings per recipe).

  Raises:
  requests.RequestException: if the cuisine index page itself cannot be
  fetched. Failures on individual cuisine or recipe pages are reported with
  print() and skipped instead.
  """

  # Fetch and parse the page listing all cuisines on Allrecipes.com.
  # Without this page there is nothing to scrape, so errors propagate.
  index_url = 'https://www.allrecipes.com/cuisine-a-z-6740455'
  index_response = requests.get(index_url, timeout=timeout)
  index_response.raise_for_status()
  index_doc = BeautifulSoup(index_response.text, "html.parser")

  # Map each cuisine name to its page URL (a repeated name keeps the last URL)
  cuisine_urls = {}
  for link in index_doc.select('ul.loc.mntl-link-list a'):
    href = link.get('href')
    if href:
      cuisine_urls[link.get_text(strip=True)] = href

  # Recipe cards appear under two exact class strings; matches for the first
  # are collected before matches for the second.
  card_classes = (
      'comp mntl-card-list-items mntl-document-card mntl-card card card--no-image',
      'comp mntl-card-list-items mntl-document-card mntl-card card--image-top card card--no-image',
  )

  # One dict per recipe; turned into a DataFrame once at the end
  recipes_data = []

  for cuisine, cuisine_url in cuisine_urls.items():
    cuisine_doc = _fetch_soup(cuisine_url, timeout)
    if cuisine_doc is None:
      continue

    recipe_cards = []
    for card_class in card_classes:
      recipe_cards.extend(cuisine_doc.find_all('a', {'class': card_class}))

    for recipe_card in recipe_cards:
      title_tag = recipe_card.find('span', {'class': 'card__title-text'})
      recipe_url = recipe_card.get('href')
      # A card without a title or a link cannot be turned into a recipe row
      if title_tag is None or not recipe_url:
        continue

      # A recipe whose page cannot be fetched is still recorded, with an
      # empty ingredient list.
      recipe_doc = _fetch_soup(recipe_url, timeout)
      ingredients_list = _extract_ingredients(recipe_doc) if recipe_doc is not None else []

      recipes_data.append({
          'Name': title_tag.text.strip(),
          'URL': recipe_url,
          'Cuisine': cuisine,
          'Ingredients': ingredients_list,
      })

  # Naming the columns keeps the frame's schema stable even when nothing was scraped
  recipes_df = pd.DataFrame(recipes_data, columns=['Name', 'URL', 'Cuisine', 'Ingredients'])
  return recipes_df

In [4]:
# Run the full scrape (the function issues one HTTP request per cuisine page and
# one per recipe page) and show the resulting DataFrame as the cell output.
recipes_df = scrape_allRecipes_cuisines()
recipes_df

Unnamed: 0,Name,URL,Cuisine,Ingredients
0,Best Vinegar Coleslaw,https://www.allrecipes.com/recipe/59318/amish-...,Amish and Mennonite,"[1 large head cabbage, cored and finely shredd..."
1,Pennsylvania-Dutch Pickled Beets and Eggs,https://www.allrecipes.com/recipe/13743/pennsy...,Amish and Mennonite,"[8 large eggs, 2 (15-ounce) cans whole pickled..."
2,Amish Macaroni Salad,https://www.allrecipes.com/recipe/74915/amish-...,Amish and Mennonite,"[2 cups uncooked elbow macaroni, 3 large hard-..."
3,Amish Friendship Bread Starter,https://www.allrecipes.com/recipe/7063/amish-f...,Amish and Mennonite,"[1 (.25 ounce) package active dry yeast, ¼ cup..."
4,My Amish Friend's Caramel Corn,https://www.allrecipes.com/recipe/74950/my-ami...,Amish and Mennonite,"[7 quarts plain popped popcorn, 2 cups dry roa..."
...,...,...,...,...
2316,Vietnamese Grilled Pork Skewers,https://www.allrecipes.com/recipe/261122/vietn...,Vietnamese,"[1 pound pork belly, cubed, 1 fresh red chile..."
2317,Goi Ga (Vietnamese Chicken and Cabbage Salad),https://www.allrecipes.com/recipe/271155/goi-g...,Vietnamese,"[4 skinless cooked chicken breasts, shredded,..."
2318,Vietnamese Fresh Spring Rolls,https://www.allrecipes.com/recipe/24239/vietna...,Vietnamese,"[2 ounces rice vermicelli, 8 rice wrappers (8..."
2319,Pho (Vietnamese Noodle Soup),https://www.allrecipes.com/recipe/228443/authe...,Vietnamese,"[4 pounds beef soup bones (shank and knee), 1 ..."


In [6]:
# Save the scraped recipes to 'recipes.csv' in the current working directory;
# index=False keeps the DataFrame index out of the file.
recipes_df.to_csv('recipes.csv', index=False)

## Clean Recipe Ingredients
First, we want to clean the ingredients list into a more consistent format, since each recipe lists its ingredients differently. The best way to do this is to make a list of key ingredients: common ingredients that our model will be able to recognize from the images input by the user. Then, we find the items from that list in each recipe's ingredients and create a new column in the dataframe to hold these key ingredients for each recipe.

In [7]:
# Load the saved recipes from the data folder and preview the first rows.
# NOTE(review): the scrape cell writes 'recipes.csv' to the working directory,
# while this reads from a Drive folder - presumably the file was copied there
# by hand; confirm the two refer to the same data.
df = pd.read_csv("/content/drive/MyDrive/data folder/recipes.csv")
df.head()

Unnamed: 0,Name,URL,Cuisine,Ingredients
0,Best Vinegar Coleslaw,https://www.allrecipes.com/recipe/59318/amish-...,Amish and Mennonite,"['1 large head cabbage, cored and finely shred..."
1,Pennsylvania Dutch Pickled Beets and Eggs,https://www.allrecipes.com/recipe/13743/pennsy...,Amish and Mennonite,"['8 large eggs', '2 (15 ounce) cans whole pick..."
2,Amish Macaroni Salad,https://www.allrecipes.com/recipe/74915/amish-...,Amish and Mennonite,"['2 cups uncooked elbow macaroni', '3 large ha..."
3,Amish Friendship Bread Starter,https://www.allrecipes.com/recipe/7063/amish-f...,Amish and Mennonite,"['1 (.25 ounce) package active dry yeast', '¼ ..."
4,My Amish Friend's Caramel Corn,https://www.allrecipes.com/recipe/74950/my-ami...,Amish and Mennonite,"['7 quarts plain popped popcorn', '2 cups dry ..."


Import the list of key ingredients and put them into a list of strings:

In [None]:
# Read the key-ingredient vocabulary: one ingredient name per line
with open('/content/drive/MyDrive/data folder/ingredients.txt', 'r') as file:
    # Read the lines of the file
    lines = file.readlines()

# Strip surrounding whitespace/newlines and lower-case each name.
# Blank lines are skipped: an empty string is a substring of every text, so an
# empty entry would later be reported as a key ingredient of every recipe.
key_ingredients = [line.strip().lower() for line in lines if line.strip()]

print(key_ingredients)  # e.g. ['artichoke', 'asparagus', 'broccoli', ...]

['artichoke', 'asparagus', 'broccoli', 'brussels sprouts', 'cabbage', 'cauliflower', 'celery', 'cilantro', 'eggplant', 'bok choy', 'lettuce', 'spinach', 'beans', 'chickpeas', 'green beans', 'lentils', 'peas', 'mushrooms', 'garlic', 'onion', 'bell pepper', 'hot pepper', 'beet', 'carrot', 'radish', 'turnip', 'potato', 'corn', 'squash', 'zucchini', 'cucumber', 'tomato', 'olives', 'rice', 'banana', 'apple', 'mango', 'grapes', 'cherries', 'rasberries', 'strawberries', 'blueberries', 'peach', 'plum', 'nectarine', 'apricot', 'blackberries', 'watermelon', 'pineapple', 'orange', 'avocado', 'lemon and lime', 'chicken', 'eggs', 'beef', 'pork', 'turkey', 'sausage', 'milk', 'cheese', 'butter', 'salmon', 'flour', 'sugar', 'chocolate']


When the recipes are read back from the CSV, each list of ingredients is loaded as a single string, so we need to remove all punctuation from that string and then search it for the key ingredients.

In [None]:
import string

def remove_punctuation(text):
  '''
  - Deletes every character listed in string.punctuation from the text;
    all other characters (letters, digits, whitespace, non-ASCII symbols)
    are left untouched
  - Input: string of ingredients
  - Output: a new string without punctuation
  '''
  # str.translate drops any character whose code point maps to None
  deletions = {ord(mark): None for mark in string.punctuation}
  cleaned = text.translate(deletions)
  return cleaned

def find_key_ingredients(df, key_ingredients):
  '''
  - Strips punctuation from each recipe's ingredient text and adds a new
    column, 'key_ingredients', listing the key ingredients mentioned in it
  - Matching is a case-insensitive substring search, so a key such as
    'onion' also matches 'onions'
  - Special cases:
      * 'corn' is not reported when the text mentions 'corn starch' or
        'corn flour'; the remaining key ingredients are still checked
      * 'eggs' is also reported when the text contains the single word 'egg'
        (whole-word match, so 'eggplant' does not count as an egg)
  - Input: dataframe of recipes with a string column 'Ingredients',
    list of key ingredients
  - Output: a new dataframe with the cleaned 'Ingredients' column and the
    added 'key_ingredients' column; the dataframe passed in is left unchanged
  '''

  def find_word_in_string(text):
    '''
    - Finds key ingredients in ingredient string
    - Input: text from ingredient column of df
    - Output: List of key ingredients found in ingredient column, in the
      order they appear in key_ingredients
    '''
    lowered = text.lower()
    # Whole words of the text, used for the exact-word 'egg' check
    words = set(lowered.split())
    # a lot of recipes use corn starch and corn flour
    mentions_corn_product = 'corn starch' in lowered or 'corn flour' in lowered

    found = []
    for key in key_ingredients:
      needle = key.lower()
      # An empty key is a substring of every text; never report it
      if not needle:
        continue
      # Skip only 'corn' itself; keep scanning the rest of the list
      if needle == 'corn' and mentions_corn_product:
        continue

      present = needle in lowered
      # Sometimes only need one egg
      if needle == 'eggs' and not present:
        present = 'egg' in words

      if present:
        found.append(key)
    return found

  # Work on a copy so the caller's dataframe is not modified
  result = df.copy()

  # Remove punctuation
  result['Ingredients'] = result['Ingredients'].apply(remove_punctuation)

  # Find key ingredients in each row
  result['key_ingredients'] = result['Ingredients'].apply(find_word_in_string)
  return result

In [None]:
# Add the 'key_ingredients' column to the recipes and preview the first rows.
new_df = find_key_ingredients(df, key_ingredients)
new_df.head()

Unnamed: 0,Name,URL,Cuisine,Ingredients,key_ingredients
0,Best Vinegar Coleslaw,https://www.allrecipes.com/recipe/59318/amish-...,Amish and Mennonite,1 large head cabbage cored and finely shredded...,"[cabbage, celery, onion, sugar]"
1,Pennsylvania Dutch Pickled Beets and Eggs,https://www.allrecipes.com/recipe/13743/pennsy...,Amish and Mennonite,8 large eggs 2 15 ounce cans whole pickled bee...,"[onion, beet, eggs, sugar]"
2,Amish Macaroni Salad,https://www.allrecipes.com/recipe/74915/amish-...,Amish and Mennonite,2 cups uncooked elbow macaroni 3 large hardcoo...,"[celery, onion, bell pepper, eggs, sugar]"
3,Amish Friendship Bread Starter,https://www.allrecipes.com/recipe/7063/amish-f...,Amish and Mennonite,1 25 ounce package active dry yeast ¼ cup warm...,"[milk, flour, sugar]"
4,My Amish Friend's Caramel Corn,https://www.allrecipes.com/recipe/74950/my-ami...,Amish and Mennonite,7 quarts plain popped popcorn 2 cups dry roast...,"[corn, sugar]"


Save the new dataframe of recipes with key ingredients to a CSV file:

In [None]:
# Write the recipes with their key ingredients to 'Recipes_cleaned.csv' in the
# current working directory; index=False keeps the DataFrame index out of the file.
new_df.to_csv('Recipes_cleaned.csv', index=False)