## Imports

In [22]:
import numpy as np
import pandas as pd
import requests
import json

In [23]:
# Presumably default file names for the metadata and recipe-list JSON files.
# NOTE(review): neither name is referenced by the other visible cells, which
# use metadataPath / dataPath instead — confirm whether these are still needed.
metaDataLocation = 'meta-lists.json'
dataLocation = 'lists.json'

In [24]:
def extractCategoryId(url):
    """Extract the numeric category id from an allrecipes category URL.

    The URL may be absolute (``http://allrecipes.com/recipes/86/world-cuisine/``,
    ``https://`` is accepted as well) or site-relative
    (``/recipes/86/world-cuisine/``). The id is the second path segment.

    Parameters
    ----------
    url : str
        Category URL or site-relative path.

    Returns
    -------
    int
        The category id, or -1 when the path part is shorter than two
        characters.

    Raises
    ------
    IndexError
        If the path has fewer than two segments.
    ValueError
        If the second segment is not an integer.
    """
    path = url
    for prefix in ('http://allrecipes.com', 'https://allrecipes.com'):
        if path.startswith(prefix):
            path = path[len(prefix):]
            break

    if len(path) < 2:
        return -1

    # Strip only the slashes that are really there. Slicing with [1:-1]
    # dropped the last character unconditionally, which truncated the id of a
    # URL without trailing slash ('/recipes/86' gave 8).
    segments = path.strip('/').split('/')
    return int(segments[1])

def formUrl(params):
    """Build the hub-feed API URL for one page of a category.

    ``params`` must provide the keys ``'id'``, ``'page'``, ``'sponsored'`` and
    ``'sort'``; each value is converted with ``str`` and substituted for the
    matching ``[key]`` placeholder of the URL template.
    """
    filled = 'https://apps.allrecipes.com/v1/assets/hub-feed?id=[id]&pageNumber=[page]&isSponsored=[sponsored]&sortType=[sort]'
    # Placeholders are filled one after the other, in this order.
    for key in ('id', 'page', 'sponsored', 'sort'):
        filled = filled.replace('[' + key + ']', str(params[key]))
    return filled

def formCatUrl(url):
    """Turn a site-relative category path into an absolute allrecipes URL."""
    return 'http://allrecipes.com' + url

## Import data functions

In [25]:
def importData(path):
    """Load previously stored data from a JSON file.

    Returns the decoded JSON content. If the file cannot be opened or read
    (``IOError``), a notice is printed and an empty list is returned instead.
    """
    loaded = []
    try:
        # The context manager closes the file; no explicit close is needed.
        with open(path, 'r') as handle:
            loaded = json.load(handle)
    except IOError:
        print('There is no data to import, a default data structure is created')
    return loaded

def exportData(data, path):
    """Write ``data`` to ``path`` as pretty-printed JSON with sorted keys.

    The data is serialised in memory before the file is opened for writing.
    Opening with ``'w'`` truncates the file immediately, so streaming with
    ``json.dump`` would destroy the previous content whenever ``data`` turns
    out not to be serialisable.

    Raises
    ------
    TypeError, ValueError
        Propagated from ``json.dumps`` when ``data`` cannot be serialised; in
        that case the file at ``path`` is left untouched.
    """
    serialised = json.dumps(data, sort_keys=True, indent=4)
    with open(path, 'w') as file:
        file.write(serialised)

## Metadata functions

In [26]:
def importMD(path):
    """Load previously stored metadata from a JSON file.

    Returns the decoded JSON content. If the file cannot be opened or read
    (``IOError``), a notice is printed and an empty list is returned instead.
    """
    loaded = []
    try:
        # The context manager closes the file; no explicit close is needed.
        with open(path, 'r') as handle:
            loaded = json.load(handle)
    except IOError:
        print('There is no meta-data to import, a default data structure is created')
    return loaded

def exportMD(data, path):
    """Write the metadata ``data`` to ``path`` as pretty-printed, key-sorted JSON.

    The data is serialised in memory before the file is opened for writing.
    Opening with ``'w'`` truncates the file immediately, so streaming with
    ``json.dump`` would destroy the previous content whenever ``data`` turns
    out not to be serialisable.

    Raises
    ------
    TypeError, ValueError
        Propagated from ``json.dumps`` when ``data`` cannot be serialised; in
        that case the file at ``path`` is left untouched.
    """
    serialised = json.dumps(data, sort_keys=True, indent=4)
    with open(path, 'w') as file:
        file.write(serialised)
    
def isCategoryExist(metadata, id):
    """Return True if some entry of ``metadata`` has ``entry['id'] == id``."""
    return any(entry['id'] == id for entry in metadata)

def addCategoryMD(metadata, url, title, id):
    """Append a fresh category record to ``metadata`` unless the id is known.

    A new record starts with ``max`` set to None and empty ``pages`` and
    ``recipeIds`` lists. When a record with the same id is already present,
    a notice is printed and ``metadata`` is left unchanged.
    """
    alreadyKnown = any(entry['id'] == id for entry in metadata)
    if alreadyKnown:
        print('The category already exists')
        return

    metadata.append({
        'url': url,
        'title': title,
        'id': id,
        'max': None,
        'pages': [],
        'recipeIds': [],
    })

def addEntryMD(metadata, id, page):
    """Record ``page`` as fetched for the category with the given id.

    The page is appended to the ``pages`` list of the first matching category
    unless it is already listed. When no category matches, a notice is
    printed and nothing is changed.
    """
    match = next((entry for entry in metadata if entry['id'] == id), None)
    if match is None:
        print('the category does not exist')
        return

    if page not in match['pages']:
        match['pages'].append(page)
        
    
def isPresentMD(metadata, id, page):
    """Return True if ``page`` is listed for the category with the given id.

    Only the first category whose id matches is consulted; an unknown id
    yields False.
    """
    for entry in metadata:
        if entry['id'] == id:
            return page in entry['pages']
    return False


## Tree functions

In [27]:
def importTree(path):
    """Load the category tree from a JSON file.

    Returns the decoded JSON content. If the file cannot be opened or read
    (``IOError``), a notice is printed and None is returned instead.
    """
    loaded = None
    try:
        # The context manager closes the file; no explicit close is needed.
        with open(path, 'r') as handle:
            loaded = json.load(handle)
    except IOError:
        print('There is no data to import')
    return loaded

def fromTreeToMD(tree, metadata, recursive=False):
    """Register a tree node, and optionally its descendants, in ``metadata``.

    The node's ``url`` and ``title`` are read, the category id is derived
    from the URL with ``extractCategoryId`` and the category is added through
    ``addCategoryMD``. With ``recursive`` set, the same is done for every
    node below it; a ``children`` value of None or ``'EMPTY'`` marks a leaf.
    """
    nodeUrl = tree['url']
    nodeId = extractCategoryId(nodeUrl)
    nodeTitle = tree['title']
    addCategoryMD(metadata, nodeUrl, nodeTitle, nodeId)

    if not recursive:
        return

    children = tree['children']
    if children is None or children == 'EMPTY':
        return

    if len(children) > 0:
        for child in children:
            fromTreeToMD(child, metadata, True)
    
def findSubTreeByUrl(tree, url):
    """Depth-first search for the node of ``tree`` whose ``url`` equals ``url``.

    Returns the first matching node (the very same object, not a copy), or
    None when there is no match. A tree or a ``children`` value that is None
    or ``'EMPTY'`` is treated as having nothing to search.
    """
    if tree is None or tree == 'EMPTY':
        return None

    if tree['url'] == url:
        return tree

    children = tree['children']
    if children is None or children == 'EMPTY':
        return None

    for child in children:
        found = findSubTreeByUrl(child, url)
        if found is not None:
            return found
    return None

## Data storage functions

In [28]:
def isRecipePresent(data, recipeId):
    """Return True if some entry of ``data`` has ``entry['id'] == recipeId``."""
    return any(entry['id'] == recipeId for entry in data)

def treatNewData(data, recipeList, category):
    """Merge the cards of one downloaded page into the collected data.

    Only cards whose ``itemType`` is ``'Recipe'`` are considered. Each such
    card is appended to ``data`` unless a recipe with the same id is already
    there, and its id is appended to ``category['recipeIds']`` unless already
    listed. Both ``data`` and ``category`` are modified in place.

    Parameters
    ----------
    data : list of dict
        Recipes collected so far; every entry must have an ``'id'`` key.
    recipeList : list of dict
        Cards of the page; every card must have ``'itemType'``, and recipe
        cards must also have ``'id'``.
    category : dict
        Metadata record of the category; its ``'recipeIds'`` list is extended.

    Notes
    -----
    Ids are looked up in sets built once per call, instead of scanning the
    whole of ``data`` and ``category['recipeIds']`` for every card. This
    assumes ids are hashable (e.g. int or str) — TODO confirm against the API
    payload.
    """
    knownIds = {recipe['id'] for recipe in data}
    categoryIds = set(category['recipeIds'])

    for card in recipeList:
        if card['itemType'] != 'Recipe':
            # Not a recipe: ignore it.
            continue

        recipeId = card['id']

        # Add the recipe to the global list, skipping duplicates.
        if recipeId not in knownIds:
            data.append(card)
            knownIds.add(recipeId)

        # Add the id to the category metadata, skipping duplicates.
        if recipeId not in categoryIds:
            category['recipeIds'].append(recipeId)
            categoryIds.add(recipeId)
        

## Request functions

In [29]:
def getPage(urlSession, session, params):
    """Request one page of the hub-feed API and return the response.

    The ``ARToken`` cookie of ``session`` is sent as a bearer token,
    ``urlSession`` is sent as the ``Referer`` header, and the URL is built
    from ``params`` by ``formUrl``.

    NOTE(review): the request is issued with the module-level
    ``requests.get`` rather than ``session.get``, so the session is only used
    to read the token — confirm this is intended.
    """
    arToken = session.cookies.get('ARToken')

    requestHeaders = {}
    requestHeaders['Origin'] = 'http://allrecipes.com'
    requestHeaders['X-Requested-With'] = 'XMLHttpRequest'
    requestHeaders['Authorization'] = 'Bearer ' + arToken
    requestHeaders['Accept'] = '*/*'
    requestHeaders['Referer'] = urlSession

    # REQUEST ----------------------------------------------
    return requests.get(formUrl(params), headers=requestHeaders)

def getPages(data, category, max_requests):
    """Download the not-yet-fetched pages of one category.

    Nothing is requested when ``category['max']`` is already set. Otherwise a
    session is opened on the category URL (this counts as one request) and
    pages are requested in increasing order, skipping those already listed in
    ``category['pages']``, until an empty page is received, the request
    budget is used up, or page 9999 has been processed.

    * A page with cards is merged through ``treatNewData`` and its number is
      appended to ``category['pages']``.
    * An empty page sets ``category['max']`` to the previous page number,
      which ends the loop.
    * A response whose status is not 200 is reported and skipped; the page is
      not recorded, and the crawl continues with the next page.

    Parameters
    ----------
    data : list
        Recipes collected so far, extended in place.
    category : dict
        Metadata record with the keys ``'id'``, ``'url'``, ``'max'``,
        ``'pages'`` and ``'recipeIds'``; modified in place.
    max_requests : int
        Upper bound on the HTTP requests this call may issue.

    Returns
    -------
    int
        Number of HTTP requests issued, the session request included.
    """
    nbRequests = 0

    if category['max'] is not None:
        print('This category is fully provided')
        return nbRequests

    urlSession = formCatUrl(category['url'])
    s = requests.Session()
    # REQUEST ----------------------------------------------
    s.get(urlSession)
    # ------------------------------------------------------
    nbRequests = nbRequests + 1
    print('New session at {url}'.format(url=urlSession))

    iteration = 1
    while category['max'] is None and iteration < 10000:
        if iteration in category['pages']:
            print('Page {page} already presents'.format(page=iteration))
        elif nbRequests >= max_requests:
            print('Too many requests have been done')
            break
        else:
            params = {'id': category['id'], 'page': iteration, 'sponsored': 'true', 'sort': 't'}
            # REQUEST ----------------------------------------------
            r = getPage(urlSession, s, params)
            # ------------------------------------------------------
            nbRequests = nbRequests + 1

            if r.status_code == 200:
                print('\tPage {page} provided'.format(page=iteration))
                recipeList = r.json()['cards']

                if len(recipeList) == 0:
                    # An empty page means the previous one was the last.
                    category['max'] = iteration - 1
                else:
                    treatNewData(data, recipeList, category)
                    category['pages'].append(iteration)
            else:
                # Report the failure instead of claiming the page was
                # provided; the page stays out of category['pages'].
                print('\tPage {page} skipped: HTTP status {status}'.format(
                    page=iteration, status=r.status_code))

        iteration = iteration + 1

    return nbRequests
    
def getCategories(data, metadata, max_requests):
    """Crawl the categories of ``metadata`` in order within a request budget.

    Each category is handed to ``getPages`` together with what is left of
    ``max_requests``; the loop stops as soon as the budget is used up.
    Returns the total number of requests reported by ``getPages``.
    """
    used = 0
    for category in metadata:
        if used >= max_requests:
            break
        used = used + getPages(data, category, max_requests - used)
    return used

## Let's start!

In [30]:
# JSON files this run loads from; the export cell further down writes the
# updated metadata and recipe list back to the same paths.
metadataPath = 'metadata-WC-byTitle.json'
dataPath = 'recipeList-WC-byTitle.json'
# Site-relative URL of the category whose subtree is crawled.
catUrl = '/recipes/86/world-cuisine/'

# Load the category tree plus any previously saved metadata and recipes.
tree = importTree('categories.json')
metadata = importMD(metadataPath)
data = importData(dataPath)

# Register the chosen category and all of its descendants in the metadata.
# NOTE(review): findSubTreeByUrl returns None when the URL is not found or the
# tree could not be loaded, and fromTreeToMD would then raise on tree['url'] —
# confirm categories.json is always present and contains catUrl.
subtree = findSubTreeByUrl(tree, catUrl)
fromTreeToMD(subtree, metadata, True)

There is no meta-data to import, a default data structure is created
There is no data to import, a default data structure is created


In [31]:
# REQUESTS ----------------------------------------------
# Crawl every category registered in the metadata, with a budget of 2000 HTTP
# requests; the displayed value is the number of requests actually issued.
getCategories(data, metadata, 2000)
# -------------------------------------------------------

New session at http://allrecipes.com/recipes/86/world-cuisine/
	Page 1 provided
	Page 2 provided
	Page 3 provided
	Page 4 provided
	Page 5 provided
	Page 6 provided
	Page 7 provided
	Page 8 provided
	Page 9 provided
	Page 10 provided
	Page 11 provided
	Page 12 provided
	Page 13 provided
	Page 14 provided
	Page 15 provided
	Page 16 provided
	Page 17 provided
	Page 18 provided
	Page 19 provided
	Page 20 provided
	Page 21 provided
	Page 22 provided
	Page 23 provided
	Page 24 provided
	Page 25 provided
	Page 26 provided
	Page 27 provided
	Page 28 provided
	Page 29 provided
	Page 30 provided
	Page 31 provided
	Page 32 provided
	Page 33 provided
	Page 34 provided
	Page 35 provided
	Page 36 provided
	Page 37 provided
	Page 38 provided
	Page 39 provided
	Page 40 provided
	Page 41 provided
	Page 42 provided
	Page 43 provided
	Page 44 provided
	Page 45 provided
	Page 46 provided
	Page 47 provided
	Page 48 provided
	Page 49 provided
	Page 50 provided
	Page 51 provided
	Page 52 provided
	Page 53 p

1611

In [32]:
# Save the updated metadata and recipe list to the paths they were loaded from.
exportMD(metadata, metadataPath)
exportData(data, dataPath)

In [33]:
# Number of recipe entries currently held in the collected list.
len(data)

11001