In [25]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
import json

In [26]:
# Recipe page that the rest of the notebook downloads and parses.
url = 'http://allrecipes.com/recipe/11758/'

# REQUEST ----------------------------------------------
# Without a timeout requests waits indefinitely on an unresponsive server,
# which would freeze the kernel on this cell.
r = requests.get(url, timeout=10)
# ------------------------------------------------------
print(r.status_code)
# Decoded response body; this is the input of parseRecipeHTML.
recipeHTML = r.text

200


In [71]:
class _RecipeParseError(Exception):
    """Internal signal that a mandatory element is missing from the recipe page."""


def _find_required(parent, name, attrs, error_message):
    """Return the first descendant of `parent` matching `name` and `attrs`.

    Raises:
        _RecipeParseError: carrying `error_message` when nothing matches.
    """
    node = parent.find(name, attrs)
    if node is None:
        raise _RecipeParseError(error_message)
    return node


def _ingredient_texts(ingredient_section, list_number):
    """Return the ingredient strings found in the `lst_ingredients_<list_number>` list."""
    ingredient_list = _find_required(
        ingredient_section, 'ul',
        {'id': 'lst_ingredients_{n}'.format(n=list_number)},
        'PARSE ERROR: the ingredient list{n} is not define'.format(n=list_number))

    texts = []
    # find_all returns an empty result when nothing matches (never None),
    # so the loop simply does not run for an empty list.
    for i, item in enumerate(ingredient_list.find_all('li', {'class': 'checkList__line'})):
        ingredient_text = _find_required(
            item, 'span', {'class': 'recipe-ingred_txt'},
            'PARSE ERROR: ingredient_text {i} list is not define'.format(i=i))
        # This entry is filtered out by its text; presumably it is a page
        # control rather than an ingredient.
        if ingredient_text.text != 'Add all ingredients to list':
            texts.append(ingredient_text.text)
    return texts


def _prep_times(directions_section):
    """Return the texts of the 'Prep', 'Cook' and 'Ready In' items (None when absent)."""
    times = {'Prep': None, 'Cook': None, 'Ready In': None}
    prep_time = _find_required(
        directions_section, 'ul', {'class': 'prepTime'},
        'PARSE ERROR: the prepTime is not define')

    for item in prep_time.find_all('li', {'class': 'prepTime__item'}):
        label = item.find('p', {'class': 'prepTime__item--type'})
        if label is None:
            # Items without a type label are ignored.
            continue
        if label.text not in times:
            print(label)
            print('WARNING: other type if prepTime item')
            continue
        time_tag = item.find('time')
        if time_tag is None:
            # A labelled item without a <time> tag used to crash with
            # AttributeError; the value is now left as None instead.
            print('WARNING: prepTime item without time value')
            continue
        times[label.text] = time_tag.text
    return times


def _parse_recipe(recipeSoup):
    """Build the recipe dict from a parsed page; raises _RecipeParseError on missing parts."""
    out = {}

    # General information -------------------------------------------
    summary = _find_required(
        recipeSoup, 'section', {'class': 'recipe-summary'},
        'PARSE ERROR: the summary is not define')
    title = _find_required(
        summary, 'h1', {'class': 'recipe-summary__h1'},
        'PARSE ERROR: the title is not define')
    author = _find_required(
        summary, 'span', {'itemprop': 'author'},
        'PARSE ERROR: the author is not define')
    description = _find_required(
        summary, 'div', {'itemprop': 'description'},
        'PARSE ERROR: the description is not define')

    out['title'] = title.text
    out['author'] = author.text
    out['description'] = description.text

    # Ingredient information ------------------------------------------
    ingredient_section = _find_required(
        recipeSoup, 'section', {'class': 'recipe-ingredients'},
        'PARSE ERROR: the ingredient section is not define')
    # Lists 1 and 2 are both mandatory and are read in that order.
    ingredients = _ingredient_texts(ingredient_section, 1)
    ingredients += _ingredient_texts(ingredient_section, 2)
    # A third list is not read; only its presence is reported.
    if ingredient_section.find('ul', {'id': 'lst_ingredients_3'}) is not None:
        print('WARNING: ingredients have been ommitted')

    out['ingredients'] = ingredients

    # Directives information ----------------------------------------
    directives = _find_required(
        recipeSoup, 'section', {'class': 'recipe-directions'},
        'PARSE ERROR: the directives are not define')
    directives_div_section = _find_required(
        directives, 'div', {'class': 'directions--section'},
        'PARSE ERROR: the directives_div_section is not define')

    times = _prep_times(directives_div_section)
    out['prepTime'] = times['Prep']
    out['cookTime'] = times['Cook']
    out['readyInTime'] = times['Ready In']

    directive_instruction = _find_required(
        directives_div_section, 'ol', {'itemprop': 'recipeInstructions'},
        'PARSE ERROR: the directive_instruction is not define')
    instructionTexts = []
    for instruction in directive_instruction.find_all('li', {'class': 'step'}):
        span = _find_required(
            instruction, 'span', {'class': 'recipe-directions__list--item'},
            'PARSE ERROR: the span in directive list is not define')
        instructionTexts.append(span.text)

    out['instructions'] = instructionTexts

    return out


def parseRecipeHTML(recipeHTML):
    """Extract the recipe fields from the HTML of a recipe page.

    Args:
        recipeHTML: markup of the page, handed to BeautifulSoup with the
            'html.parser' backend.

    Returns:
        A dict with the keys 'title', 'author', 'description',
        'ingredients' (list of str), 'prepTime', 'cookTime', 'readyInTime'
        (str, or None when the page has no such item) and 'instructions'
        (list of str). None is returned, after printing a 'PARSE ERROR: ...'
        line, when a mandatory element is missing from the page.
    """
    try:
        return _parse_recipe(bs(recipeHTML, 'html.parser'))
    except _RecipeParseError as error:
        # Same contract as before: report the problem on stdout, return None.
        print(error)
        return None
    

In [72]:
# Parse the page downloaded earlier; parseRecipeHTML returns None (after
# printing a PARSE ERROR line) when a mandatory element is missing.
recipe = parseRecipeHTML(recipeHTML)
# Bare last expression so the notebook displays the parsed value.
recipe

{'author': 'Colleen B. Smith',
 'cookTime': '35 m',
 'description': '\r\n"A lady I worked with brought this in one day, and it was a hit. Now it is the favorite of all my dinner guests. It\'s great for a covered dish dinner too. I have made this also without the meat, and it is well received."        ',
 'ingredients': ['1 pound dry ziti pasta',
  '1 onion, chopped',
  '1 pound lean ground beef',
  '2 (26 ounce) jars spaghetti sauce',
  '6 ounces provolone cheese, sliced',
  '1 1/2 cups sour cream',
  '6 ounces mozzarella cheese, shredded',
  '2 tablespoons grated Parmesan cheese'],
 'instructions': ['Bring a large pot of lightly salted water to a boil. Add ziti pasta, and cook until al dente, about 8 minutes; drain.',
  'In a large skillet, brown onion and ground beef over medium heat. Add spaghetti sauce, and simmer 15 minutes.',
  'Preheat the oven to 350 degrees F (175 degrees C). Butter a 9x13 inch baking dish. Layer as follows: 1/2 of the ziti, Provolone cheese, sour cream, 1/2 s

In [75]:
s = requests.Session()
# REQUEST ----------------------------------------------
# The response itself is discarded; the call is made so that the cookies set
# by the server end up in the session's cookie jar. The timeout keeps the cell
# from hanging forever on an unresponsive server.
s.get(url, timeout=10)
# ------------------------------------------------------
token = s.cookies.get('ARToken')
if token is None:
    # cookies.get returns None for a missing cookie; fail here with a clear
    # message instead of a TypeError on the string concatenation below.
    raise RuntimeError('ARToken cookie was not set by ' + url)
# Headers prepared for an authenticated request (the request below is
# currently commented out).
headers = {
    'Origin':'http://allrecipes.com',
    'X-Requested-With':'XMLHttpRequest',
    'Authorization':'Bearer ' + token,
    'Accept':'*/*',
    'Referer':url
}
# REQUEST ----------------------------------------------
#r = requests.get(url, headers=headers)
# ------------------------------------------------------

In [76]:
# NOTE(review): this displays the raw value that is sent as the Bearer
# credential; clear this cell's output before sharing the notebook.
token

'EAAAADpv...<redacted>'