In [21]:
from pymongo import MongoClient
import datetime
import numpy as np
import pandas as pd
import requests
import json
import getpass

## List Functions

In [22]:
def unique(a):
    """Return the elements of ``a`` with duplicates removed.

    The first occurrence of each element is kept and the input order is
    preserved, so the result is the same on every run (iterating a plain
    ``set`` yields an arbitrary order).

    Elements must be hashable.
    """
    seen = set()
    result = []
    for item in a:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result

def intersect(a, b):
    """Return the elements present in both ``a`` and ``b``, without duplicates.

    The result follows the order of first occurrence in ``a`` so that it is
    the same on every run (iterating a plain ``set`` yields an arbitrary
    order).

    Elements must be hashable.
    """
    in_b = set(b)
    seen = set()
    result = []
    for item in a:
        if item in in_b and item not in seen:
            seen.add(item)
            result.append(item)
    return result

def union(a, b):
    """Return every distinct element of ``a`` and ``b`` as a single list.

    Elements of ``a`` come first, followed by the elements of ``b`` that were
    not already seen; within each list the order of first occurrence is kept.
    This makes the result the same on every run (iterating a plain ``set``
    yields an arbitrary order).

    Elements must be hashable.
    """
    seen = set()
    result = []
    for source in (a, b):
        for item in source:
            if item not in seen:
                seen.add(item)
                result.append(item)
    return result

## Metadata Functions

In [23]:
def importData(path):
    """Load previously saved data from the JSON file at ``path``.

    Returns the decoded JSON content. When the file cannot be opened or read
    (``IOError``, e.g. it does not exist yet), a notice is printed and an
    empty list is returned instead. Malformed JSON is not handled here and
    propagates to the caller.
    """
    try:
        # JSON text is decoded as UTF-8 explicitly rather than with the
        # platform's locale encoding; the ``with`` block closes the file.
        with open(path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except IOError:
        print('There is no data to import, a default data structure is created')
        return []

def importMD(path):
    """Load previously saved meta-data from the JSON file at ``path``.

    Returns the decoded JSON content. When the file cannot be opened or read
    (``IOError``, e.g. it does not exist yet), a notice is printed and an
    empty list is returned instead. Malformed JSON is not handled here and
    propagates to the caller.
    """
    try:
        # JSON text is decoded as UTF-8 explicitly rather than with the
        # platform's locale encoding; the ``with`` block closes the file.
        with open(path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except IOError:
        print('There is no meta-data to import, a default data structure is created')
        return []

def exportMD(data, path):
    """Serialise ``data`` as JSON into the file at ``path``.

    The file is overwritten. Keys are sorted and nesting is indented by four
    spaces.
    """
    encoder = json.JSONEncoder(sort_keys=True, indent=4)
    with open(path, 'w') as out:
        # Stream the encoded document piece by piece into the file.
        for chunk in encoder.iterencode(data):
            out.write(chunk)
    
def isCategoryExist(metadata, id):
    """Tell whether ``metadata`` holds a category whose ``'id'`` equals ``id``.

    ``metadata`` is iterated in order and the search stops at the first match.
    """
    return any(entry['id'] == id for entry in metadata)

def addCategoryMD(metadata, url, title, id):
    """Append a new, empty category record to ``metadata``.

    When a category with the same ``id`` is already present, a message is
    printed and ``metadata`` is left untouched. Otherwise a record carrying
    ``url``, ``title`` and ``id`` is appended, with ``'max'`` set to ``None``
    and empty ``'pages'`` and ``'recipeIds'`` lists.
    """
    already_known = any(entry['id'] == id for entry in metadata)
    if already_known:
        print('The category already exists')
        return

    metadata.append({
        'url': url,
        'title': title,
        'id': id,
        'max': None,
        'pages': [],
        'recipeIds': [],
    })

def addEntryMD(metadata, id, page):
    """Record ``page`` in the ``'pages'`` list of the category identified by ``id``.

    The first category of ``metadata`` whose ``'id'`` equals ``id`` is used.
    A page that is already listed is not added a second time. When no
    category matches, a message is printed and nothing is modified.
    """
    target = next((entry for entry in metadata if entry['id'] == id), None)
    if target is None:
        print('the category does not exist')
        return

    known_pages = target['pages']
    if page not in known_pages:
        known_pages.append(page)
        
    
def isPresentMD(metadata, id, page):
    """Tell whether ``page`` is already recorded for the category ``id``.

    Only the first category of ``metadata`` whose ``'id'`` equals ``id`` is
    inspected. Returns ``False`` when no category matches.
    """
    for entry in metadata:
        if entry['id'] == id:
            return page in entry['pages']
    return False

## Mongo Functions

In [24]:
def addRecipeMG(collection, recipe):
    """Insert ``recipe`` into ``collection`` unless its ``'recipeID'`` is already stored.

    Returns the value of ``collection.insert_one(recipe)`` when the document
    was inserted, or ``None`` when a document with the same ``'recipeID'``
    was found and nothing was written.
    """
    already_stored = collection.find_one({'recipeID': recipe['recipeID']})
    if already_stored is not None:
        return None
    return collection.insert_one(recipe)

## Request Function

In [25]:
def getRecipe(session, id, timeout=30):
    """Request the recipe ``id`` from the allrecipes API and return the response.

    Parameters
    ----------
    session : object whose ``cookies.get('ARToken')`` yields the bearer token
        (in this notebook, a ``requests.Session`` that already visited the
        site).
    id : recipe identifier inserted into the request URL.
    timeout : seconds to wait for the server before giving up. Without a
        timeout ``requests`` can block indefinitely on an unresponsive
        server.

    Returns
    -------
    The response object returned by ``requests.get``; the caller is
    responsible for checking its status code.

    Raises
    ------
    ValueError
        If the session holds no ``ARToken`` cookie, since the
        ``Authorization`` header cannot be built without it.
    """
    token = session.cookies.get('ARToken')
    if token is None:
        # Fail with an explicit message instead of the opaque TypeError that
        # concatenating 'Bearer ' with None would produce.
        raise ValueError("The session has no 'ARToken' cookie; cannot authorize the request")

    headers = {
        'Origin':'http://allrecipes.com',
        'X-Requested-With':'XMLHttpRequest',
        'Authorization':'Bearer ' + token,
        'Accept':'*/*',
        'Referer':'http://allrecipes.com/'
    }

    urlRecipe = 'https://apps.allrecipes.com/v1/recipes/{id}'.format(id=id)
    print(urlRecipe)
    # REQUEST ----------------------------------------------
    r = requests.get(urlRecipe, headers=headers, timeout=timeout)
    # ------------------------------------------------------
    return r

## Let's start

In [26]:
database_name = 'ada-project'
user = input('MongoDB name: ')
password = getpass.getpass('MongoDB password: ')
# Credentials are handed to MongoClient directly: Database.authenticate() is
# deprecated and no longer exists in PyMongo 4. authSource keeps the
# authentication database equal to the one that is used below.
client = MongoClient(
    'www.cocotte-minute.ovh',
    27017,
    username=user,
    password=password,
    authSource=database_name,
)
db = client[database_name]
# MongoClient connects lazily; issue a cheap command now so that wrong
# credentials or an unreachable server are reported in this cell.
db.command('ping')
collection = db['recipes']

# Files produced by the earlier crawling steps: per-category meta-data and
# the list of recipes.
metadataPath = 'metadata-WC-byTitle-notSponsored.json'
recipeListPath = 'recipeList-WC-byTitle-notSponsored.json'
metadata = importMD(metadataPath)
recipeList = importData(recipeListPath)

# Upper bound on the number of HTTP requests issued by this notebook run.
maxRequests = 5000

MongoDB name: admin
MongoDB password: ········


In [27]:
# Merge the recipe ids of every category into one duplicate-free list.
ids = []
for sub in metadata:
    ids = union(ids, sub['recipeIds'])

# Sanity check: both counts are printed so they can be compared.
print(len(recipeList))
print(len(ids))

10578
10578


In [28]:
# Open a session on the site's home page; the cookies it receives are read
# later by getRecipe() to authorize the API calls.
s = requests.Session()
url = 'http://allrecipes.com/'
nb_request = 0
# REQUEST ----------------------------------------------
# A timeout prevents the cell from blocking forever on an unresponsive server.
s.get(url, timeout=30)
# ------------------------------------------------------
nb_request = nb_request + 1
print('New session at {url}'.format(url=url))

New session at http://allrecipes.com/


In [29]:
# Download every recipe that is not stored yet, stopping as soon as the
# request budget (maxRequests, shared with the session-opening request) is
# used up.
for recipeId in ids:
    if nb_request >= maxRequests:
        break
    # Recipes already in the database are skipped without spending a request.
    if collection.find_one({'recipeID':recipeId}) is not None:
        continue
    try:
        # REQUEST ----------------------------------------------
        r = getRecipe(s, recipeId)
        # ------------------------------------------------------
    except requests.RequestException as error:
        # A network failure on one recipe must not abort the whole crawl;
        # the attempt still counts against the budget.
        nb_request = nb_request + 1
        print('request for recipe {id} failed: {error}'.format(id=recipeId, error=error))
        continue
    print('request for recipe {id}'.format(id=recipeId))
    nb_request = nb_request + 1
    if r.status_code == 200:
        addRecipeMG(collection, r.json())
        print('New entry in the database: Recipe {id}'.format(id=recipeId))
    else:
        # Report which status was received so failures can be diagnosed.
        print('ERROR CODE {code}'.format(code=r.status_code))
print(nb_request)

https://apps.allrecipes.com/v1/recipes/229588
request for recipe 229588
New entry in the database: Recipe 229588
https://apps.allrecipes.com/v1/recipes/230718
request for recipe 230718
New entry in the database: Recipe 230718
https://apps.allrecipes.com/v1/recipes/231316
request for recipe 231316
New entry in the database: Recipe 231316
https://apps.allrecipes.com/v1/recipes/231465
request for recipe 231465
New entry in the database: Recipe 231465
https://apps.allrecipes.com/v1/recipes/232288
request for recipe 232288
New entry in the database: Recipe 232288
https://apps.allrecipes.com/v1/recipes/232560
request for recipe 232560
New entry in the database: Recipe 232560
https://apps.allrecipes.com/v1/recipes/232729
request for recipe 232729
New entry in the database: Recipe 232729
https://apps.allrecipes.com/v1/recipes/232920
request for recipe 232920
New entry in the database: Recipe 232920
https://apps.allrecipes.com/v1/recipes/233073
request for recipe 233073
New entry in the databas

In [30]:
# Number of recipes stored so far. Collection.count() is deprecated and was
# removed in PyMongo 4; count_documents({}) counts every document.
collection.count_documents({})

16242