# TODO Markdown title here and explaination

In [1]:
import re
import os.path
import uritools
import numpy as np
import scipy as sp
import pandas as pd
from bs4 import BeautifulSoup

import seaborn as sns
import matplotlib.pyplot as plt

from pyspark.sql import *
import pyspark.sql.functions as f # wierd that I have to do that

import threading

In [2]:
# General parameters
%matplotlib inline
plt.style.use('seaborn')#switch to seaborn style
plt.rcParams["figure.figsize"] = [16,10]

spark = SparkSession.builder.getOrCreate()

DATA_FOLDER = './data/'
RECIPES_PATH = DATA_FOLDER + 'recipePages/'

In [3]:
# TODO: for loop
def test_loader(filename):
    
    html_string = RECIPES_PATH + filename 
    html = open(html_string,'r')
    soup = BeautifulSoup(open(html_string), 'html.parser')
    hostname = uritools.urisplit(soup.find(True,href=True)['href']).gethost()
    
    return soup

**myrecipes.com**

In [4]:
def scrap_myrecipes_com( soup ):
    
    #<li itemprop="ingredient" itemscope="" itemtype="http://data-vocabulary.org/RecipeIngredient">
    #<span itemprop="amount">1 cup</span>
    #<span itemprop="name"> chopped tomato</span>
    #<span itemprop="preparation"> </span>
    #</li>
    # Get All the ingredients
    ingredient_list = list()
    for tag in soup.findAll("li", {"itemprop": re.compile('.*ingredient*', flags=re.IGNORECASE)}):
        ingredient_list.append(re.sub(r'\n|\t|\r|  ', '', tag.text))

    ## How to append to list?
    # If 'banana' doesnt exist in the list -> add mashed ripe banana to the list
    # if afterwards we see banana, by regex matching we should get back mashed ripe banana, in this case we rename
    # the mashed ripe to banana ( comparing the size of the keywords)

    # Fetch nutritional information
    # TODO, we can get WAY MORE if you look just below in the website
    # how can we store all the informations if somes are missing?
    nutritive_info = soup.findAll(True, {"class": re.compile('.*nutri*.', flags=re.IGNORECASE)})

    # Sub Extracting nutritive informations per Serving
    soup_nutrition = BeautifulSoup(str(nutritive_info), 'html.parser')

    # Nutrition values
    nutrition_dict = {}
    for tag in soup_nutrition.findAll('span'):
        nutrition_dict.update({tag.get("itemprop"): re.sub(r'\n|\t|\r|  ', '', tag.text)})

    # Recipe Title
    title = re.sub(r'\n|\t|\r', '',soup.title.text).split('|')[0]

    # Ratings
    rating = soup.find(attrs={"name": "recipe_average_rating"})
    rating = re.findall(r'\d+|\d+\.\d+', str(rating), re.IGNORECASE)[0]
    #n_rating = soup.findAll("span", {"class": "count"})[0].text
    
    return [ (title, ingredient_list, nutrition_dict, rating) ]

In [5]:
## Test
soup = test_loader('1710f9ca5c3a03bfd6688570a5a6a46b.html');
columns = ['Title', 'Ingredients', 'Nutrition', 'Rating']

#
vals = scrap_myrecipes_com(soup)
df = spark.createDataFrame(vals, columns)
df.show()

+--------------------+--------------------+--------------------+------+
|               Title|         Ingredients|           Nutrition|Rating|
+--------------------+--------------------+--------------------+------+
|Quick Roasted-Veg...|[2 1/2 cups julie...|[fiber -> 7.8g, c...|     0|
+--------------------+--------------------+--------------------+------+



**allrecipe.com**

In [6]:
def scrap_AllRecipe_com( soup ):
    # Get All the ingredients
    ingredient_list = list()
    for tag in soup.findAll("li", {"class": re.compile('.*ingredient*', flags=re.IGNORECASE)}):
        ingredient_list.append(re.sub(r'\n|\t|\r|  ', '', tag.text))

    ## How to append to list?
    # If 'banana' doesnt exist in the list -> add mashed ripe banana to the list
    # if afterwards we see banana, by regex matching we should get back mashed ripe banana, in this case we rename
    # the mashed ripe to banana ( comparing the size of the keywords)

    # Fetch nutritional information
    # TODO, we can get WAY MORE if you look just below in the website
    # how can we store all the informations if somes are missing?
    nutritive_info = soup.findAll(True, {"id": re.compile('.*nutri*.', flags=re.IGNORECASE)})

    # Sub Extracting nutritive informations per Serving
    soup_nutrition = BeautifulSoup(str(nutritive_info), 'html.parser')

    # Nutrition values
    nutrition_dict = {}
    for tag in soup_nutrition.findAll('span'):
        nutrition_dict.update({tag.get("class")[0]: tag.text})

    # Recipe Title
    title = re.sub(r'\n|\t|\r|  ', '', soup_nutrition.findAll('div', {'class': 'rectitle'})[0].text)

    # Ratings
    rating = re.search(r'AverageRating":(\d+\.\d+)',soup.text, re.IGNORECASE).group(1)
    n_rating = soup.findAll("span", {"class": "count"})[0].text

    #
    return [ (title, ingredient_list, nutrition_dict, rating) ]

In [7]:
## Test
soup = test_loader('0a7e6e2cae6d4da800d13ef59e760dd3.html');
vals = scrap_AllRecipe_com(soup)

# create DataFrame
newRow = spark.createDataFrame(vals)
df = df.union(newRow)
df.show()
soup

+--------------------+--------------------+--------------------+------+
|               Title|         Ingredients|           Nutrition|Rating|
+--------------------+--------------------+--------------------+------+
|Quick Roasted-Veg...|[2 1/2 cups julie...|[fiber -> 7.8g, c...|     0|
|    Banana Muffins I|[1 cup all-purpos...|[sodium -> 194mg,...|   4.3|
+--------------------+--------------------+--------------------+------+



<!-- 0a7e6e2cae6d4da800d13ef59e760dd3.html http://allrecipes.com/Recipe/banana-muffins-i/detail.aspx //-->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<!--[if lt IE 7 ]> <html class="ie6" xmlns="http://www.w3.org/1999/xhtml"> <![endif]-->
<!--[if IE 7 ]>    <html class="ie7" xmlns="http://www.w3.org/1999/xhtml"> <![endif]-->
<!--[if IE 8 ]>    <html class="ie8" xmlns="http://www.w3.org/1999/xhtml"> <![endif]-->
<!--[if IE 9 ]>    <html class="ie9" xmlns="http://www.w3.org/1999/xhtml"> <![endif]-->
<!--[if (gt IE 9)|!(IE)]><!--> <html xmlns="http://www.w3.org/1999/xhtml"> <!--<![endif]-->
<!-- ARLOG SERVER:WEB704 LOCAL_IP: 192.168.5.174 REMOTE_IP:131.107.192.193 TYPESPECIFICID: 20933 MERCH_KEY: MerchData_4_1_1_0_***_10_16_18_34_35_36_38_43_47_48_49_50_51_59 -->
<head><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/><meta content='(pics-1.1 "http://www.icra.org/pics/vocabularyv03/" l gen tru

**Flow chart**
- Check every website name <- **done, some are not really helpful**
- create for each website a dedicated function to fetch informations
- Keep in mind to avoid empty recipes website ( as index pages)
- save all this to a pysprk parquet file

#### Started at 14:05

In [8]:
def foo_gla(listfiles, df, idx):

    i = 0
    for filename in listfiles:
    
        # Opening HTML file
        with open(RECIPES_PATH + filename, 'rb') as file:
            try:
                # Parsing the HTML file
                soup = BeautifulSoup(file, 'html.parser')
    
                # Retrieve the HostName
                hostname = uritools.urisplit(soup.find('a',href=True)['href']).gethost()
    
                # Realising corresponding scraping
                # TODO switch case???
                if(hostname == 'www.myrecipes.com'):
                    vals = scrap_myrecipes_com(soup)
                    # create DataFrame
                    newRow = spark.createDataFrame(vals)
                    df = df.union(newRow)
                
                elif(hostname == 'allrecipes.com'):
                    vals = scrap_AllRecipe_com(soup)
                    # create DataFrame
                    newRow = spark.createDataFrame(vals)
                    df = df.union(newRow)
            
            except : # whatever reader errors you care about
                continue
        i += 1
        if(i % 500 == 0):
            print(i)
    df.write.format('parquet').save(DATA_FOLDER + 'pyspark_df_'+str(idx)+'.parquet', mode='overwrite')

In [13]:
# For each file we first look at every possiblr website
files = os.listdir(RECIPES_PATH)

k = len(files)-1
N = 10
k_N = int(k/N)
for i in range(N):
    t = threading.Thread(target=foo_gla, args=(files[i*k_N:(i+1)*k_N],df, i))
    t.start()

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [14]:
# TODO: AWESOME!!! FOOD SUBSTITIONS GONNA CHECK FOR OTHER WEBSITE
# http://thatsmyhome.com/food-substitutions/
#df.toPandas()

500
500
500
500
500
500
500
500
500
500


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1000
1000
1000
1000
1000
1000
1000
1000
1000


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1500
1500
1500
1500
1500
1500
1500
1500
1500


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2000
2000
2000
2000
2000
2000
2000
2000
2000


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2500
2500
2500
2500
2500
2500
2500
2500
2500
3000
3000
3000
3000
3000
3000
3000
1000
3000
3000


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3500
3500
3500
3500


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3500
3500
3500


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3500
1500
3500
4000
4000
4000
4000
4000
4000
4000
2000
4000
4000


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4500
4500
4500
4500
4500
4500
2500
4500
4500
4500


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5000
5000
5000
5000
3000
5000
5000
5000
5000
5000
5500
3500
5500
5500
5500
5500
5500
5500


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5500
5500


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4000


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


6000
6000
6000
6000
6000
6000
6000
6000
6000


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4500
6500
6500
6500
6500
6500
6500
6500
6500
6500


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5000
7000
7000
7000


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


7000
7000
7000
7000
7000
7000


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


5500
7500
7500
7500
7500
7500
7500
7500
7500
7500


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


6000
8000
8000
8000
8000
8000
8000


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


8000


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


8000
8000


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


6500
8500
8500
8500
8500
8500
8500
8500
8500


In [11]:
df.count()

2

In [12]:
filenames = os.listdir(RECIPES_PATH)
open_files = map(open, [ RECIPES_PATH + filename for filename in filenames])
for filename in open_files:


SyntaxError: unexpected EOF while parsing (<ipython-input-12-882600125cce>, line 3)

In [None]:
filename.read