<a href="https://colab.research.google.com/github/TanushGoel/Gordon-RamsAI/blob/master/Web_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Food.com V1

In [None]:
import requests
from bs4 import BeautifulSoup, SoupStrainer

recipes = []
# NOTE(review): this strainer is created with no criteria, so it presumably
# does not narrow what gets parsed -- confirm whether it is needed at all.
strainer = SoupStrainer()

# Sequential crawl: one request per candidate recipe id.
for recipe_num in range(38, 50000):

    try:

        # Only the trailing id varies between requests. The timeout keeps a
        # single stalled connection from hanging the whole crawl.
        resp = requests.get("https://www.food.com/recipe/low-fat-berry-blue-frozen-dessert-" + str(recipe_num), timeout=30) # start url
        soup = BeautifulSoup(resp.content, features="html.parser", parse_only=strainer)

        # Ingredients: every <li> in the left column holds one or more
        # "ingredient-part" spans whose first child carries the text.
        ingredient_parts = []
        left_column = soup.find('div', attrs={'class':'recipe-layout__content-left recipe-layout__truncated-element'})
        for li in left_column.find_all('li'):
            for part in li.find_all('span', attrs={'class':'recipe-ingredients__ingredient-part'}):
                ingred = part.contents[0].text
                # Parts that start with "," are skipped. startswith() also
                # copes with an empty string, where ingred[0] would raise.
                if not ingred.startswith(","):
                    ingredient_parts.append(ingred.strip())

        # Directions: the text of every step <li> in the right column.
        right_column = soup.find('div', attrs={'class':'recipe-layout__content-right recipe-layout__truncated-element'})
        steps = [li.text for li in right_column.find_all('li', attrs={'class':'recipe-directions__step'})]

        name = soup.find('div', attrs={'class':'recipe-title'}).contents[0].text.lower()

        recipes.append({
            "name": name,
            "ingredients": " ".join(ingredient_parts).strip().lower(),
            "directions": " ".join(steps).strip().lower(),
        })

    except KeyboardInterrupt:

        # Manual stop: report how far the crawl got and keep what was collected.
        print("number:", recipe_num)
        break

    except (requests.RequestException, AttributeError, IndexError):

        # The request failed, or the page lacks the expected layout
        # (soup.find() returned None, or a span had no children). Skip this id
        # rather than aborting the whole crawl.
        continue

print(len(recipes), "total recipes scraped")

In [None]:
import pandas as pd

# Turn the scraped list of {"name", "ingredients", "directions"} dicts into a table.
recipes = pd.DataFrame.from_dict(recipes)

# Unit-of-measure words (lower-case, singular) to strip from ingredient text.
# NOTE: "fluid ounce" contains a space, so it can never equal a single
# whitespace-separated word; "ounce" is matched on its own and "fluid" is kept.
measure_words = ["cup", "fluid ounce", "fl", "ounce", "tablespoon", "teaspoon", "t", "tb", "tsp","tbsp", "gill", "pint", "quart", "qt", "gallon", "ml", "milliliter", "millilitre", "cc", "l",
                 "liter", "litre", "dl", "deciliter", "decilitre", "pound", "lb", "ounce", "oz", "mg", "milligram", "milligramme", "gram", "g", "gramme", "kilogram", "kilogramme", "kg"]

# Every token remove_measures() drops: each measure word plus its naive "s" and
# "es" plural. Built once so the per-word check is a single set lookup instead
# of rebuilding two plural lists for every word of every recipe.
_MEASURE_TOKENS = frozenset(
    token
    for unit in measure_words
    for token in (unit, unit + "s", unit + "es")
)


def remove_measures(text):
    """Return ``text`` with measurement-unit words removed.

    The text is split on whitespace; any word that exactly equals an entry of
    ``measure_words``, or that entry followed by "s" or "es", is dropped. The
    comparison is case-sensitive. The remaining words are re-joined with single
    spaces, so runs of whitespace collapse.

    Args:
        text: Ingredient string to clean.

    Returns:
        The cleaned string ("" when nothing is left).
    """
    return " ".join(word for word in text.split() if word not in _MEASURE_TOKENS)

# Strip measurement-unit words from every recipe's ingredient string.
recipes.ingredients = recipes.ingredients.apply(remove_measures)

# Preview the first rows as the cell's output.
recipes.head()

In [None]:
# Write the table to CSV without the index column, then use the Colab helper
# to download that file.
recipes.to_csv("recipes_webscraped.csv", index=False)
from google.colab import files
files.download("recipes_webscraped.csv")

# Food.com V2

In [None]:
import requests
from bs4 import BeautifulSoup, SoupStrainer
import concurrent.futures
import time

# One URL per candidate recipe id: the slug text stays constant and only the
# trailing number changes. range() covers ids 38 through 541005 inclusive.
recipe_urls = [
    "https://www.food.com/recipe/low-fat-berry-blue-frozen-dessert-" + str(recipe_id)
    for recipe_id in range(38, 541006)
]

def scrape(url, timeout=30):
    """Download one recipe page and extract its name, ingredients and directions.

    Args:
        url: Address of the recipe page to fetch.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection cannot block the calling worker thread indefinitely.

    Returns:
        A dict with the lower-cased strings "name", "ingredients" and
        "directions".

    Raises:
        requests.RequestException: The request failed or timed out.
        AttributeError: An expected element is missing from the page
            (``soup.find`` returned None).
        IndexError: A matched element has no children.
    """
    resp = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(resp.content, features="html.parser")

    # Ingredients: every <li> in the left column holds one or more
    # "ingredient-part" spans whose first child carries the text.
    ingredient_parts = []
    left_column = soup.find('div', attrs={'class':'recipe-layout__content-left recipe-layout__truncated-element'})
    for li in left_column.find_all('li'):
        for part in li.find_all('span', attrs={'class':'recipe-ingredients__ingredient-part'}):
            ingred = part.contents[0].text
            # Parts that start with "," are skipped. startswith() also copes
            # with an empty string, where ingred[0] would raise.
            if not ingred.startswith(","):
                ingredient_parts.append(ingred.strip())

    # Directions: the text of every step <li> in the right column.
    right_column = soup.find('div', attrs={'class':'recipe-layout__content-right recipe-layout__truncated-element'})
    steps = [li.text for li in right_column.find_all('li', attrs={'class':'recipe-directions__step'})]

    return {"name": soup.find('div', attrs={'class':'recipe-title'}).contents[0].text.lower(),
            "ingredients": " ".join(ingredient_parts).strip().lower(),
            "directions": " ".join(steps).strip().lower()}

def web_scrape(workers=18):
    """Run ``scrape`` over every URL in ``recipe_urls`` on a thread pool.

    ``workers`` is the size of the thread pool. The return value is the list of
    whatever ``scrape`` returned, in the order the calls finished; URLs whose
    call raised an ``Exception`` are left out.
    """
    scraped = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
        # Queue every URL up front, then harvest results as they complete.
        pending = [pool.submit(scrape, page_url) for page_url in recipe_urls]

        for finished in concurrent.futures.as_completed(pending):
            try:
                scraped.append(finished.result())
            except Exception:
                # scrape() failed for this URL: drop it and keep going.
                continue

    return scraped

# Time the full crawl with wall-clock timestamps taken before and after it.
t0 = time.time()
recipes = web_scrape()
t1 = time.time()
# NOTE: the count printed is len(recipe_urls), i.e. the number of URLs
# attempted; the number actually scraped is len(recipes).
print(f"{t1-t0} seconds to parse {len(recipe_urls)} recipes")

14728.549757957458 seconds to parse 270503 recipes


In [None]:
import pandas as pd

# Turn the list of recipe dicts returned by web_scrape() into a table.
recipes = pd.DataFrame.from_dict(recipes)

# Unit-of-measure words (lower-case, singular) to strip from ingredient text.
# NOTE: "fluid ounce" contains a space, so it can never equal a single
# whitespace-separated word; "ounce" is matched on its own and "fluid" is kept.
measure_words = ["cup", "fluid ounce", "fl", "ounce", "tablespoon", "teaspoon", "t", "tb", "tsp","tbsp", "gill", "pint", "quart", "qt", "gallon", "ml", "milliliter", "millilitre", "cc", "l",
                 "liter", "litre", "dl", "deciliter", "decilitre", "pound", "lb", "ounce", "oz", "mg", "milligram", "milligramme", "gram", "g", "gramme", "kilogram", "kilogramme", "kg", "pinch"]

# Every token remove_measures() drops: each measure word plus its naive "s" and
# "es" plural. Built once so the per-word check is a single set lookup instead
# of rebuilding two plural lists for every word of every recipe.
_MEASURE_TOKENS = frozenset(
    token
    for unit in measure_words
    for token in (unit, unit + "s", unit + "es")
)

# Translation table that deletes "(", ")" and "," in one pass.
_PUNCTUATION_TO_DROP = str.maketrans("", "", "(),")


def remove_measures(text):
    """Return ``text`` with digits, some punctuation and unit words removed.

    Steps, in order:
      1. ``text`` is converted with ``str()`` and every character for which
         ``str.isdigit()`` is true is deleted.
      2. "(", ")" and "," are deleted.
      3. The result is split on whitespace; any word that exactly equals an
         entry of ``measure_words``, or that entry followed by "s" or "es",
         is dropped (case-sensitive).

    Args:
        text: Ingredient value to clean; non-strings are stringified first.

    Returns:
        The remaining words joined by single spaces ("" when nothing is left).
    """
    text = "".join(ch for ch in str(text) if not ch.isdigit())
    text = text.translate(_PUNCTUATION_TO_DROP)
    return " ".join(word for word in text.split() if word not in _MEASURE_TOKENS)

# Strip digits, brackets/commas and measurement-unit words from every ingredient string.
recipes.ingredients = recipes.ingredients.apply(remove_measures)

In [None]:
import nltk
!python -m nltk.downloader all -q
from textblob import TextBlob

def extract_nouns(text):
    """Return the noun phrases TextBlob finds in ``text``, joined by single spaces.

    Intended effect, per the original author's example (not verified here):
    "milk low-fat plain yogurt orange juice banana honey depending on how sweet you like your smoothies pure vanilla extract" --> "milk low-fat plain yogurt orange juice banana honey pure vanilla extract"
    """
    blob = TextBlob(text)
    phrases = blob.noun_phrases
    return " ".join(phrases)

# Replace each ingredient string with just its noun phrases.
recipes.ingredients = recipes.ingredients.apply(extract_nouns)

In [None]:
names = []
ingreds = []
directions = []

# Keep only the rows whose three text fields survive a UTF-8 round trip.
# Encoding with 'surrogatepass' lets a lone surrogate through to bytes, and the
# strict decode that follows then rejects it, so such rows are dropped.
for raw_name, raw_ingred, raw_direction in zip(recipes.name, recipes.ingredients, recipes.directions):

    try:
        name = raw_name.encode('utf-8','surrogatepass').decode('utf-8')
        ingred = raw_ingred.encode('utf-8','surrogatepass').decode('utf-8')
        direction = raw_direction.encode('utf-8','surrogatepass').decode('utf-8')

    except (UnicodeError, AttributeError):
        # UnicodeError: the text is not valid UTF-8 after the round trip.
        # AttributeError: the cell is not a string (it has no .encode).
        # Caught narrowly so that KeyboardInterrupt still stops the loop.
        continue

    # Append only once all three fields are known to be clean, so the three
    # lists always stay the same length.
    names.append(name)
    ingreds.append(ingred)
    directions.append(direction)

recipes = pd.DataFrame({"name": names,
                        "ingredients": ingreds,
                        "directions": directions})

# Drop the temporary lists now that the table holds the data.
del names, ingreds, directions

recipes.head()

In [None]:
# Write the cleaned table to CSV without the index column, then use the Colab
# helper to download that file.
recipes.to_csv("recipes_webscraped.csv", index=False)
from google.colab import files
files.download("recipes_webscraped.csv")