# TODO Markdown title here and explaination

In [45]:
import re
import os.path
import uritools
import numpy as np
import scipy as sp
import pandas as pd
from bs4 import BeautifulSoup

import seaborn as sns
import matplotlib.pyplot as plt

from pyspark.sql import *
import pyspark.sql.functions as f # wierd that I have to do that

In [46]:
# General parameters
%matplotlib inline
plt.style.use('seaborn')#switch to seaborn style
plt.rcParams["figure.figsize"] = [16,10]

#spark = SparkSession.builder.getOrCreate()

DATA_FOLDER = './data/'
RECIPES_PATH = DATA_FOLDER + 'recipePages/'

In [47]:
# TODO: for loop
def test_loader(filename):
    
    html_string = RECIPES_PATH + filename 
    html = open(html_string,'r')
    soup = BeautifulSoup(open(html_string), 'html.parser')
    print(re.sub(r'\n|\t', '',soup.title.text))
    
    return soup

**Flow chart**
- Check every website name <- **done, some are not really helpful**
- create for each website a dedicated function to fetch informations
- Keep in mind to avoid empty recipes website ( as index pages)
- save all this to a pysprk parquet file 

**myrecipes.com**

In [131]:
def scrap_myrecipes_com( soup ):
    
    #<li itemprop="ingredient" itemscope="" itemtype="http://data-vocabulary.org/RecipeIngredient">
    #<span itemprop="amount">1 cup</span>
    #<span itemprop="name"> chopped tomato</span>
    #<span itemprop="preparation"> </span>
    #</li>
    # Get All the ingredients
    ingredient_list = list()
    for tag in soup.findAll("li", {"itemprop": re.compile('.*ingredient*', flags=re.IGNORECASE)}):
        ingredient_list.append(re.sub(r'\n|  ', '', tag.text))

    ## How to append to list?
    # If 'banana' doesnt exist in the list -> add mashed ripe banana to the list
    # if afterwards we see banana, by regex matching we should get back mashed ripe banana, in this case we rename
    # the mashed ripe to banana ( comparing the size of the keywords)

    # Fetch nutritional information
    # TODO, we can get WAY MORE if you look just below in the website
    # how can we store all the informations if somes are missing?
    soup.findAll(True, {"class": re.compile('.*nutri*.', flags=re.IGNORECASE)})

    # Sub Extracting nutritive informations per Serving
    soup_nutrition = BeautifulSoup(str(nutritive_info), 'html.parser')

    # Nutrition values
    nutrition_dict = {}
    for tag in soup_nutrition.findAll('span'):
        nutrition_dict.update({tag.get("class")[0]: tag.text})

    # Recipe Title
    title = re.sub(r'\n|\t', '',soup.title.text).split('|')[0]

    # Ratings
    rating = soup.find(attrs={"name": "recipe_average_rating"})
    rating = re.findall(r'\d+|\d+\.\d+', str(rating), re.IGNORECASE)[0]
    #n_rating = soup.findAll("span", {"class": "count"})[0].text
    
    return title, ingredient_list, nutrition_dict, rating

In [132]:
## Test
soup = test_loader('1710f9ca5c3a03bfd6688570a5a6a46b.html');
scrap_myrecipes_com(soup)

Quick Roasted-Vegetable Fajitas Recipe | MyRecipes.com


('Quick Roasted-Vegetable Fajitas Recipe ',
 ['2 1/2 cups julienne-cut zucchini ',
  '2 cups julienne-cut yellow squash ',
  '2 cups red bell pepper strips ',
  '1 1/2 cups vertically sliced red onion ',
  '3 tablespoons vegetable soup and dip mix (such as Lipton Recipe Secrets) ',
  '4 teaspoons olive oil ',
  '8  (8-inch) flour tortillas ',
  '1  (16-ounce) can fat-free refried beans ',
  '2 cups shredded leaf lettuce ',
  '1 cup (4 ounces) reduced-fat shredded cheddar cheese ',
  '1 cup chopped tomato ',
  '1/2 cup bottled salsa '],
 {'calories': '88',
  'fat': '1.6g',
  'cholesterol': '20mg',
  'sodium': '194mg',
  'totalcarbs': '16.9g',
  'dietaryfiber': '0.8g',
  'protein': '2g'},
 '0')

**cdkitchen.com**

In [162]:
def scrap_cdkitchen_com( soup ):
    
    #<li itemprop="ingredient" itemscope="" itemtype="http://data-vocabulary.org/RecipeIngredient">
    #<span itemprop="amount">1 cup</span>
    #<span itemprop="name"> chopped tomato</span>
    #<span itemprop="preparation"> </span>
    #</li>
    # Get All the ingredients
    ingredient_list = list() # TODO returns only the ingredient no proportions is retrieved
    for tag in soup.findAll(True, {"itemprop": re.compile('.*ingredient*', flags=re.IGNORECASE)}):
        ingredient_list.append(re.sub(r'\n|  ', '', tag.text))

    print(ingredient_list)
    ## How to append to list?
    # If 'banana' doesnt exist in the list -> add mashed ripe banana to the list
    # if afterwards we see banana, by regex matching we should get back mashed ripe banana, in this case we rename
    # the mashed ripe to banana ( comparing the size of the keywords)

    # Fetch nutritional information
    # TODO, we can get WAY MORE if you look just below in the website
    # how can we store all the informations if somes are missing?
    soup.findAll(True, {"class": re.compile('.*nutri*.', flags=re.IGNORECASE)})

    # Sub Extracting nutritive informations per Serving
    soup_nutrition = BeautifulSoup(str(nutritive_info), 'html.parser')

    # Nutrition values
    nutrition_dict = {}
    for tag in soup_nutrition.findAll('span'):
        nutrition_dict.update({tag.get("class")[0]: tag.text})

    # Recipe Title
    title = re.sub(r'\n|\t', '',soup.title.text).split('|')[0]

    # Ratings
    rating = soup.find(attrs={"name": "recipe_average_rating"})
    rating = re.findall(r'\d+|\d+\.\d+', str(rating), re.IGNORECASE)[0]
    #n_rating = soup.findAll("span", {"class": "count"})[0].text
    
    return title, ingredient_list, nutrition_dict, rating

In [167]:
## Test
soup = test_loader('007d33623f319508995db278ad90de3d.html');
#scrap_cdkitchen_com(soup)

Kentucky Derby Thoroughbred Pie Recipe from CDKitchen.com


**ifood.tv**

def scrap_ifood_tv( soup ):
    
    # Get All the ingredients
    ingredient_list = list()
    for tag in soup.findAll(True, {"itemprop": re.compile('.*ingredient*', flags=re.IGNORECASE)}):
        ingredient_list.append(re.sub(r'\n|  ', '', tag.text))

    print(ingredient_list)
    ## How to append to list?
    # If 'banana' doesnt exist in the list -> add mashed ripe banana to the list
    # if afterwards we see banana, by regex matching we should get back mashed ripe banana, in this case we rename
    # the mashed ripe to banana ( comparing the size of the keywords)

    # Fetch nutritional information
    # TODO, we can get WAY MORE if you look just below in the website
    # how can we store all the informations if somes are missing?
    nutritive_info = soup.findAll(True, {"id": re.compile('.*nutri*.', flags=re.IGNORECASE)})

    # Sub Extracting nutritive informations per Serving
    soup_nutrition = BeautifulSoup(str(nutritive_info), 'html.parser')

    # Nutrition values
    nutrition_dict = {}
    for tag in soup_nutrition.findAll('span'):
        nutrition_dict.update({tag.get("class")[0]: tag.text})

    # Recipe Title
    title = re.sub(r'\n|\t|  ', '', soup_nutrition.findAll('div', {'class': 'rectitle'})[0].text)

    # Ratings
    rating = re.search(r'AverageRating":(\d+\.\d+)',soup.text, re.IGNORECASE).group(1)
    n_rating = soup.findAll("span", {"class": "count"})[0].text

    #
    return title, ingredient_list, nutrition_dict, rating

## Test, TODO bad site
soup = test_loader('004598453f5a3067b9b1e9d8bdc9e630.html');
#scrap_ifood_tv(soup)
soup.findAll(True, {"id": re.compile('.*nutri*.', flags=re.IGNORECASE)})
soup

**allrecipe.com**

In [134]:
def scrap_AllRecipe_com( soup ):
    # Get All the ingredients
    ingredient_list = list()
    for tag in soup.findAll("li", {"class": re.compile('.*ingredient*', flags=re.IGNORECASE)}):
        ingredient_list.append(re.sub(r'\n|  ', '', tag.text))

    ## How to append to list?
    # If 'banana' doesnt exist in the list -> add mashed ripe banana to the list
    # if afterwards we see banana, by regex matching we should get back mashed ripe banana, in this case we rename
    # the mashed ripe to banana ( comparing the size of the keywords)

    # Fetch nutritional information
    # TODO, we can get WAY MORE if you look just below in the website
    # how can we store all the informations if somes are missing?
    nutritive_info = soup.findAll(True, {"id": re.compile('.*nutri*.', flags=re.IGNORECASE)})

    # Sub Extracting nutritive informations per Serving
    soup_nutrition = BeautifulSoup(str(nutritive_info), 'html.parser')

    # Nutrition values
    nutrition_dict = {}
    for tag in soup_nutrition.findAll('span'):
        nutrition_dict.update({tag.get("class")[0]: tag.text})

    # Recipe Title
    title = re.sub(r'\n|\t|  ', '', soup_nutrition.findAll('div', {'class': 'rectitle'})[0].text)

    # Ratings
    rating = re.search(r'AverageRating":(\d+\.\d+)',soup.text, re.IGNORECASE).group(1)
    n_rating = soup.findAll("span", {"class": "count"})[0].text

    #
    return title, ingredient_list, nutrition_dict, rating

In [135]:
## Test
soup = test_loader('0a7e6e2cae6d4da800d13ef59e760dd3.html');
scrap_AllRecipe_com(soup)

Banana Muffins I Recipe - Allrecipes.com


('Banana Muffins I',
 ['1 cup all-purpose flour',
  '1 tablespoon baking powder',
  '1/2 teaspoon baking soda',
  '1/4 teaspoon salt',
  '1 cup mashed ripe banana',
  '1/4 cup white sugar',
  '1/4 cup sour cream',
  '1 egg',
  '1/2 teaspoon vanilla extract'],
 {'calories': '88',
  'fat': '1.6g',
  'cholesterol': '20mg',
  'sodium': '194mg',
  'totalcarbs': '16.9g',
  'dietaryfiber': '0.8g',
  'protein': '2g'},
 '4.3')

In [136]:
# TODO: AWESOME!!! FOOD SUBSTITIONS GONNA CHECK FOR OTHER WEBSITE
# http://thatsmyhome.com/food-substitutions/