## Imports

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
import json

## Variables

In [4]:
urlBase = 'http://allrecipes.com'
urlCategories = urlBase + '/recipes/?grouping=all'
dataLocation = 'categories.json'

## Functions (Auxiliary)

In [5]:
#/recipes/86/world-cuisine/
#/recipes/233/world-cuisine/asian/
def validLink(urlBase, urlLink):
    """Tell whether ``urlLink`` is a direct sub-category of ``urlBase``.

    Both arguments are site-relative category paths such as
    ``/recipes/86/world-cuisine/``. The link is accepted when it has exactly
    one more path segment than the base and every base segment from the
    third one onwards equals the segment at the same position in the link.
    The first two segments (``recipes`` and the numeric id) are not
    compared.

    Args:
        urlBase: path of the parent category.
        urlLink: path of the candidate sub-category.

    Returns:
        True if ``urlLink`` sits exactly one level below ``urlBase``,
        False otherwise.
    """
    # Strip the surrounding slashes instead of blindly dropping the first
    # and last characters, so a path written without a trailing '/' keeps
    # its final letter.
    base_parts = urlBase.strip('/').split('/')
    link_parts = urlLink.strip('/').split('/')

    if len(link_parts) - len(base_parts) != 1:
        return False

    # zip() stops at the end of the shorter (base) list, so only the
    # segments shared with the base are compared.
    return all(
        base_seg == link_seg
        for base_seg, link_seg in zip(base_parts[2:], link_parts[2:])
    )
validLink('/recipes/86/', '/recipes/233/world-cuisine/')


def branchAlreadyExists(url, url_list):
    """Tell whether a node with the given url is already in ``url_list``.

    Args:
        url: the url to look for.
        url_list: list of node dicts, each holding a ``'url'`` key.

    Returns:
        True if at least one entry of ``url_list`` has ``entry['url'] == url``,
        False otherwise (including when the list is empty).
    """
    # any() stops at the first match instead of scanning the whole list.
    return any(entry['url'] == url for entry in url_list)

## Functions (Scrape categories)

In [4]:
def provideTreeLevel0(tree):
    """Initialise an empty dict, in place, as the root node of the tree.

    The root receives the keys ``'title'``, ``'url'`` (the module-level
    ``urlBase``) and ``'children'`` (``None``, meaning "not fetched yet").

    Args:
        tree: the dict to initialise; it must be empty.

    Returns:
        True if the dict was filled in, False if ``tree`` is ``None`` or
        already contains data (a message is printed in both cases).
    """
    if tree is None:
        print('impossible to set the tree')
        return False

    if tree:
        # Refuse to overwrite a dict that already holds data.
        print('Please enter an empty dict')
        return False

    tree['title'] = 'AllRecipes'
    tree['url'] = urlBase
    tree['children'] = None
    return True
    

def provideTreeLevel1(tree, timeout=30):
    """Fetch the top-level categories and attach them to the root node.

    Downloads the page at the module-level ``urlCategories``, looks for the
    ``<section id="herolinks">`` block and, inside it, the "show all" grid
    of ``hero-link__item`` anchors. Each anchor becomes a child node
    ``{'title', 'url', 'children': None}``. When the expected section or
    grid is missing from a successfully downloaded page, ``'children'`` is
    set to the string ``'EMPTY'``.

    Args:
        tree: the root node; only processed when ``tree['children']`` is
            ``None``.
        timeout: seconds to wait for the server before giving up.

    Returns:
        True if ``tree['children']`` was updated, False if the data was
        already present or the request failed. On failure the node is left
        untouched so that a later run can retry it.
    """
    if tree['children'] is not None:
        # Nothing to do
        print('Data is already present: nothing to do !')
        return False

    url = urlCategories

    # REQUEST ----------------------------------------------
    try:
        response = requests.get(url, timeout=timeout)
        # Do not parse an HTTP error page: it would contain no category
        # and the node would be wrongly flagged 'EMPTY' for good.
        response.raise_for_status()
    except requests.RequestException as err:
        print('Request failed for {url}: {err}'.format(url=url, err=err))
        return False
    # ------------------------------------------------------

    categoriesSoup = bs(response.text, 'html.parser')
    cat = categoriesSoup.find('section', {'id': "herolinks"})
    if cat is not None:
        cat = cat.find('div', {'class': "grid ng-hide", 'ng-show': "showAll===true"})

    if cat is None:
        tree['children'] = 'EMPTY'
    else:
        links = cat.find_all('a', {'class': "hero-link__item"})
        tree['children'] = [
            {'title': link['title'], 'url': link['href'], 'children': None}
            for link in links
        ]

    print('New entry in the dataset:')
    print(url)
    return True


def provideTreeLevel2(tree, timeout=30):
    """Fetch the direct sub-categories of a category node.

    Downloads the page at ``urlBase + tree['url']``, looks for the
    ``<section class="hub-daughters">`` block and its ``hubDaughtersDiv``
    container, then keeps every "hub nav" anchor that ``validLink`` accepts
    as a direct sub-category and that is not already in the list. Each kept
    anchor becomes a child node ``{'title', 'url', 'children': None}``.
    When the expected section or container is missing from a successfully
    downloaded page, ``'children'`` is set to the string ``'EMPTY'``.

    Args:
        tree: the category node; only processed when ``tree['children']``
            is ``None``.
        timeout: seconds to wait for the server before giving up.

    Returns:
        True if ``tree['children']`` was updated, False if the data was
        already present or the request failed. On failure the node is left
        untouched so that a later run can retry it.
    """
    url = urlBase + tree['url']

    if tree['children'] is not None:
        # Nothing to do
        return False

    # REQUEST ----------------------------------------------
    try:
        response = requests.get(url, timeout=timeout)
        # Do not parse an HTTP error page: it would contain no sub-category
        # and the node would be wrongly flagged 'EMPTY' for good.
        response.raise_for_status()
    except requests.RequestException as err:
        print('Request failed for {url}: {err}'.format(url=url, err=err))
        return False
    # ------------------------------------------------------

    subCategoriesSoup = bs(response.text, 'html.parser')
    subCat = subCategoriesSoup.find('section', {'class': "hub-daughters"})
    if subCat is not None:
        subCat = subCat.find('div', {'id': "hubDaughtersDiv"})

    if subCat is None:
        tree['children'] = 'EMPTY'
    else:
        subData = []
        for link in subCat.find_all('a', {'data-internal-referrer-link': "hub nav"}):
            href = link['href']
            if not validLink(tree['url'], href):
                print('the url is not a direct sub-category')
                continue
            if branchAlreadyExists(href, subData):
                print('the url already exists')
                continue
            # NOTE(review): assumes every accepted anchor wraps an <img>
            # carrying a 'title' attribute - confirm against the page markup.
            img = link.find('img')
            subData.append({'title': img['title'], 'url': href, 'children': None})

        tree['children'] = subData

    print('New entry in the dataset:')
    print(url)
    return True

## Functions (Iteration over branches)

We will perform multiple network calls, and requests can fail. We don't want to saturate the website, so we will only fetch what we don't already have!

Fetch the categories only when necessary:
    * If the 'children' value is None, fetching has to be performed
    * If the 'children' value is already provided, there is nothing to fetch
    * If the 'children' value is the string 'EMPTY', there is no child, so there is nothing to fetch
    

In [5]:
def BrowseTreeAuxi(tree, depth, nb_request, maxDepth, maxRequest):
    """Recursively complete the category tree, within a request budget.

    For the given node:
      * an empty/falsy ``tree`` is initialised with ``provideTreeLevel0``;
      * ``children == 'EMPTY'`` is skipped;
      * ``children is None`` triggers one fetch (``provideTreeLevel1`` at
        depth 0, ``provideTreeLevel2`` deeper) and counts one request;
      * otherwise every child is browsed, provided ``depth < maxDepth``.
    Whenever something changed, the same node is browsed again so that the
    freshly added children get processed too.

    Args:
        tree: the node to process (modified in place).
        depth: depth of ``tree`` in the whole tree, 0 for the root.
        nb_request: number of requests already counted before this call.
        maxDepth: children are only visited while ``depth < maxDepth``.
        maxRequest: nothing is done once ``nb_request`` reaches this value.

    Returns:
        A tuple ``(treeHasChanged, nb_local_request)``: whether the node or
        one of its descendants was modified, and the updated request count.
    """
    treeHasChanged = False
    nb_local_request = nb_request

    if nb_request >= maxRequest:
        # Budget exhausted: leave the node untouched.
        print('Too many requests ({nb}) have been done'.format(nb=nb_request))
        return treeHasChanged, nb_local_request

    if not tree:
        print('to process {level}'.format(level=depth))
        treeHasChanged = provideTreeLevel0(tree)

    elif tree['children'] == 'EMPTY':
        # Known to have no child: nothing to fetch.
        pass

    elif tree['children'] is None:
        if depth >= 0:
            print('At depth {level}:'.format(level=depth))
            fetch = provideTreeLevel1 if depth == 0 else provideTreeLevel2
            treeHasChanged = fetch(tree)
            # Counted as one request whatever the outcome of the fetch.
            nb_local_request = nb_local_request + 1

    elif depth < maxDepth:
        for child in tree['children']:
            change, nb_local_request = BrowseTreeAuxi(
                child, depth + 1, nb_local_request, maxDepth, maxRequest)
            treeHasChanged = (treeHasChanged or change)

    if treeHasChanged:
        # re-browse: the node now has new content to explore.
        print('re-browse')
        change, nb_local_request = BrowseTreeAuxi(
            tree, depth, nb_local_request, maxDepth, maxRequest)
        treeHasChanged = (treeHasChanged or change)

    return treeHasChanged, nb_local_request

def BrowseTree(tree, maxDepth=10, maxRequest=12):
    """Browse ``tree`` from its root with a fresh request counter.

    Entry point of the traversal: delegates to ``BrowseTreeAuxi`` starting
    at depth 0 with zero requests counted, and returns its result unchanged.
    """
    start_depth = 0
    requests_so_far = 0
    return BrowseTreeAuxi(tree, start_depth, requests_so_far, maxDepth, maxRequest)

## Let's start !

In [6]:
data = {}

# Import existing data: reuse the tree saved by a previous run, if any.
# When the file cannot be opened, `data` stays an empty dict.
try:
    # The `with` block closes the file on exit; no explicit close() needed.
    with open(dataLocation, 'r') as file:
        data = json.load(file)
except IOError:
    print('There is no data to import, a default data structure is created')


In [7]:
# Complete `data` in place, allowing up to 500 requests for this run.
# The call returns (treeHasChanged, number of requests counted).
BrowseTree(data, maxDepth=10, maxRequest=500)

(False, 0)

In [8]:
# Export data: write the tree to `dataLocation` as JSON with sorted keys
# and a 4-space indent.
with open(dataLocation, 'w') as file:
    json.dump(data, file, sort_keys=True, indent=4)

number of requests by depth
    * 0 -> 1
    * 1 -> 23
    * 2 -> 263
    * 3 -> 1255
    * 4 -> 877
    * 5 -> 112
    * 6 -> 8
    * 7 -> 0