In [1]:
import gc
import json
import random
import re
import sys
from time import sleep
from urllib2 import urlopen
from urlparse import urljoin

import lxml
from bs4 import BeautifulSoup

In [2]:
# Root URL of the site being scraped; the scraping helpers build page and recipe URLs from it.
# It ends with '/', so code that appends a path beginning with '/' has to account
# for that to avoid a '//' in the resulting URL.
BASE_URL = "http://www.delish.com/"

In [3]:
def generate_landingfeed_link(pagenum):
    """
    Build the URL of one landing-feed page.

    Each landing page contains a chunk (~60) of recipes. Other pages are reached by
    changing the page number 'XXX' in '.../landing-feed-special/1/XXX?&id=...'.

    pagenum: integer page number; only pages 1 through 71 exist.

    Returns the landing page URL as a string, or -1 (after printing a message)
    when pagenum is outside 1..71.
    """
    # only pages 1..71 exist; reject anything outside that range
    if not 1 <= pagenum <= 71:
        print("Didn\'t get link. There are only 71 landing pages.\n")
        return -1

    # BASE_URL ends with '/', and the path fragment starts with '/': strip the
    # trailing slash first so the result does not contain '//' after the host.
    landing_page_link = (BASE_URL.rstrip('/') + '/landing-feed-special/1/' + str(pagenum) +
                         '?&id=12&template=contenttype&landing=recipes')
    print("Generated link for landing page: %i" % (pagenum))
    return landing_page_link
    

def get_recipe_links(lp_link):
    """
    Get the recipe links found on a given landing page.

    lp_link: URL of a landing page (as produced by generate_landingfeed_link).

    Returns a list of absolute recipe URLs, or -1 (after printing a message)
    when the landing page could not be fetched.
    """
    try:
        response = urlopen(lp_link)
        try:
            html = response.read()
        finally:
            # release the connection right away instead of waiting for garbage collection
            response.close()
        print("Succesfully opened link: %s\n" % (lp_link))
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still interrupt a long scrape
        print("%s is not a working link.\n" % (lp_link))
        return -1

    soup = BeautifulSoup(html, 'lxml')
    # put all links for a given page in a list
    recipe_links = []
    for anchor in soup.findAll("a", "landing-feed--special-title link link-txt"):
        href = anchor.get('href')
        if href:
            # urljoin resolves root-relative and absolute hrefs against the site
            # root without producing a doubled '/'
            recipe_links.append(urljoin(BASE_URL, href))
    soup.decompose()
    del html
    gc.collect()
    return recipe_links
    
    #return ####
    

def read_recipe_link(recipe_link):
    """
    Given a link to a recipe on Delish.com, get the name of the recipe and the list of ingredients.

    recipe_link: URL of a single recipe page.

    Returns a one-entry dictionary {recipe name: [ingredient, ...]}, or -1 (after
    printing a message) when the page could not be fetched or does not contain the
    expected recipe header / ingredient list.
    """
    try:
        response = urlopen(recipe_link)
        try:
            html = response.read()
        finally:
            # release the connection right away instead of waiting for garbage collection
            response.close()
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still interrupt a long scrape
        print("%s is not a working link.\n" % (recipe_link))
        return -1

    soup = BeautifulSoup(html, 'lxml')
    try:
        # the recipe name lives in the <h1> of the article header
        header = soup.find("header", "article-header")
        title = header.h1 if header is not None else None
        ingredient_list = soup.find('ul', 'recipe-list recipe-ingredients-list')
        if title is None or ingredient_list is None:
            print("%s does not look like a recipe page.\n" % (recipe_link))
            return -1

        # Copy the name into a plain unicode object. A bs4 NavigableString keeps
        # references into the parse tree, so returning it would keep the whole
        # page alive for as long as the recipe is stored.
        name_node = title.string
        if name_node is None:
            # <h1> with nested markup: fall back to its concatenated text
            name_node = title.get_text()
        recipe_name = unicode(name_node)

        # Every ingredient is preceded by a newline character; remove it before
        # adding to the list. Items whose .string is not a string are skipped.
        ingredients = [unicode(item.string).replace('\n', '')
                       for item in ingredient_list.findAll('li', 'recipe-ingredients-item')
                       if isinstance(item.string, basestring)]
    finally:
        soup.decompose()

    # return the name of the recipe and its ingredients in a dictionary object
    return {recipe_name: ingredients}
    
    #return###
    
def get_all_recipes(page_range):
    """
    Get the recipe names and ingredients for every recipe on the given landing pages.

    page_range: iterable of landing page numbers, each between 1 and 71.

    Returns a list with one dictionary per recipe, in link order:
    [{recipe1: [ingr1, ingr2, ....]}, {recipe2: [ingr1, ....]}, ...].
    Returns None (after printing a message) when a page number is out of range,
    or when a landing page or a recipe page could not be read.
    """
    # materialize once: page_range is used for validation and again for the loop,
    # which would silently yield nothing the second time for a one-shot iterator
    pages = list(page_range)

    # turns out there are only ever 71 landing pages
    allowed_pagenumbers = set(range(1, 72))
    if not set(pages).issubset(allowed_pagenumbers):
        print("Didn\'t get any recipes. Can only look at pages 1 thru 71.")
        return

    # concatenate all the recipe links
    all_recipe_links = []
    for pagenum in pages:
        landing_page_link = generate_landingfeed_link(pagenum)
        recipe_links = get_recipe_links(landing_page_link)
        if recipe_links == -1 or recipe_links is None:
            print("Had to terminate scraping because a landing page link was bad.\n")
            return
        all_recipe_links.extend(recipe_links)
        sleep(random.uniform(2.0, 4.0))  # add a random amount of deadtime so that the server isn't overloaded

    num_recipe_links = len(all_recipe_links)
    print("There are a total of %i recipe links.\n" % (num_recipe_links))

    # collect one {"recipe name": [list of ingredients]} object per link
    recipeName_ingredients = []
    for rlink in all_recipe_links:
        recipe = read_recipe_link(rlink)
        # same failure test as for landing pages: either sentinel means the link
        # was bad (the previous `!= -1 or is not None` check was always true)
        if recipe == -1 or recipe is None:
            print("Had to terminate scraping because a recipe link was bad.\n")
            return
        recipeName_ingredients.append(recipe)
        sleep(random.uniform(0.02, .04))  # add a random amount of deadtime so that the server isn't overloaded

    gc.collect()
    return recipeName_ingredients
    #return #### Troubleshooting
        

In [4]:
# Baseline for the memory-leak investigation: tally, per type, the objects
# currently returned by gc.get_objects() before running the scraper.
from collections import defaultdict
from gc import get_objects
before = defaultdict(int)  # type -> object count before the scrape
after = defaultdict(int)   # type -> object count after the scrape (filled in by a later cell)
for i in get_objects():
    before[type(i)] += 1

In [5]:
recipes1_3 = get_all_recipes([1,2,3])

Generated link for landing page: 1
Succesfully opened link: http://www.delish.com//landing-feed-special/1/1?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 2
Succesfully opened link: http://www.delish.com//landing-feed-special/1/2?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 3
Succesfully opened link: http://www.delish.com//landing-feed-special/1/3?&id=12&template=contenttype&landing=recipes

There are a total of 180 recipe links.



In [6]:
# Tally the objects again and print, per type, the non-zero change in count
# relative to the `before` snapshot.
for i in get_objects():
    after[type(i)] += 1
print [(k, after[k] - before[k]) for k in after if after[k] - before[k]]

[(<class 'zmq.sugar.socket.Socket'>, 1), (<class 'bs4.element.ProcessingInstruction'>, 14), (<class 'tornado.stack_context.NullContext'>, -1), (<class '_ast.ListComp'>, 1), (<type 'list'>, 286370), (<class 'bs4.element.Doctype'>, 180), (<type 'listiterator'>, -1), (<class 'bs4.element.Tag'>, 155006), (<class '_ast.Print'>, 1), (<class 'bs4.element.NavigableString'>, 98086), (<type 'tuple'>, -102), (<class '_ast.Subscript'>, 4), (<class '_ast.Name'>, 5), (<type 'dict'>, 385858), (<class '_ast.Call'>, -2), (<class '_ast.Tuple'>, 1), (<class 'bs4.BeautifulSoup'>, 180), (<type 'instance'>, 9), (<type 'instancemethod'>, -1), (<class 'bs4.element.CharsetMetaAttributeValue'>, 180), (<type 'weakref'>, 6), (<class 'bs4.element.Comment'>, 1260), (<class 'urlparse.SplitResult'>, 3), (<type 'lxml.etree._RotatingErrorLog'>, 1), (<class '_ast.Index'>, 4), (<class '_ast.BinOp'>, 2), (<type 'frame'>, -5), (<class '_ast.comprehension'>, 1)]


In [7]:
# Reset both tallies and take a fresh per-type object-count baseline before the next scraper run.
before = defaultdict(int)
after = defaultdict(int)
for i in get_objects():
    before[type(i)] += 1

In [8]:
recipes1_3 = get_all_recipes([1,2,3])

Generated link for landing page: 1
Succesfully opened link: http://www.delish.com//landing-feed-special/1/1?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 2
Succesfully opened link: http://www.delish.com//landing-feed-special/1/2?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 3
Succesfully opened link: http://www.delish.com//landing-feed-special/1/3?&id=12&template=contenttype&landing=recipes

There are a total of 180 recipe links.



In [9]:
# Tally the objects again and print, per type, the non-zero change in count
# relative to the most recent `before` snapshot.
for i in get_objects():
    after[type(i)] += 1
print [(k, after[k] - before[k]) for k in after if after[k] - before[k]]

[(<class 'bs4.element.ProcessingInstruction'>, 14), (<class '_ast.ListComp'>, 1), (<type 'list'>, 286173), (<class 'bs4.element.Doctype'>, 180), (<type 'listiterator'>, 1), (<class 'bs4.element.Tag'>, 155006), (<class '_ast.Print'>, 1), (<class 'bs4.element.NavigableString'>, 98086), (<type 'tuple'>, 2), (<class '_ast.Subscript'>, 4), (<class '_ast.Name'>, 5), (<type 'dict'>, 385750), (<class '_ast.Call'>, -2), (<class '_ast.Tuple'>, 1), (<class 'bs4.BeautifulSoup'>, 180), (<class 'bs4.element.CharsetMetaAttributeValue'>, 180), (<type 'weakref'>, 2), (<class 'bs4.element.Comment'>, 1260), (<class 'urlparse.SplitResult'>, 3), (<class '_ast.Index'>, 4), (<class '_ast.BinOp'>, 2), (<class '_ast.comprehension'>, 1)]


In [11]:
# Reset both tallies and take a fresh per-type object-count baseline before the next scraper run.
before = defaultdict(int)
after = defaultdict(int)
for i in get_objects():
    before[type(i)] += 1

In [12]:
recipes1_3 = get_all_recipes([1,2,3])

Generated link for landing page: 1
Succesfully opened link: http://www.delish.com//landing-feed-special/1/1?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 2
Succesfully opened link: http://www.delish.com//landing-feed-special/1/2?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 3
Succesfully opened link: http://www.delish.com//landing-feed-special/1/3?&id=12&template=contenttype&landing=recipes

There are a total of 180 recipe links.



In [13]:
# Tally the objects again and print, per type, the non-zero change in count
# relative to the most recent `before` snapshot.
for i in get_objects():
    after[type(i)] += 1
print [(k, after[k] - before[k]) for k in after if after[k] - before[k]]

[(<class '_ast.ListComp'>, 1), (<type 'list'>, -15), (<type 'listiterator'>, 1), (<class 'bs4.element.Tag'>, -8), (<class '_ast.Print'>, 1), (<class 'bs4.element.NavigableString'>, -5), (<type 'tuple'>, 2), (<class '_ast.Subscript'>, 4), (<class '_ast.Name'>, 5), (<type 'dict'>, -5), (<class '_ast.Call'>, -2), (<class '_ast.Tuple'>, 1), (<type 'weakref'>, 2), (<class 'urlparse.SplitResult'>, 3), (<class '_ast.Index'>, 4), (<class '_ast.BinOp'>, 2), (<class '_ast.comprehension'>, 1)]


In [103]:
recipes1_3

[{u'Spaghetti with Turkey Ragu': [u'12 oz. spaghetti',
   u'1  large onion, diced',
   u'2 cloves garlic, minced',
   u'1 large carrot',
   u'1 lb. ground turkey',
   u'kosher salt',
   u'Freshly ground black pepper',
   u'1 c. white wine',
   u'1 28-oz. can crushed tomatoes',
   u'2 tbsp. freshly chopped rosemary',
   u'Freshly chopped parsley, for garnish',
   u'Freshly grated Parmesan, for garnish']},
 {u'Chicken, Bacon, and Spinach Spaghetti': [u'12 oz. spaghetti or angel hair',
   u'1 tbsp. extra-virgin olive oil',
   u'1 lb. boneless skinless chicken breasts',
   u'kosher salt',
   u'Freshly ground black pepper',
   u'6  slices bacon',
   u'2 cloves garlic',
   u'2 c. diced tomatoes (canned or fresh)',
   u'3 c. baby spinach',
   u'1/2 c. heavy cream',
   u'1/3 c. freshly grated Parmesan',
   u'Fresh basil, for garnish']},
 {u'Hot Chocolate Fudge': [u'2  14 oz. cans sweetened condensed milk, divided',
   u'2  packets hot chocolate mix',
   u'2 c. semisweet chocolate chips',
   u'

In [82]:
recipes5 = get_all_recipes([5])

Generated link for landing page: 5
Succesfully opened link: http://www.delish.com//landing-feed-special/1/5?&id=12&template=contenttype&landing=recipes

There are a total of 60 recipe links.



In [13]:
# NOTE(review): sys.getsizeof reports only the size of the list object returned by
# gc.get_objects() itself, not the memory of the objects that list refers to, so
# this is a rough proxy for how many objects are tracked rather than total memory.
sys.getsizeof(gc.get_objects())

12383864

In [101]:
recipes7_9 = get_all_recipes([7,8,9])

Generated link for landing page: 7
Succesfully opened link: http://www.delish.com//landing-feed-special/1/7?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 8
Succesfully opened link: http://www.delish.com//landing-feed-special/1/8?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 9
Succesfully opened link: http://www.delish.com//landing-feed-special/1/9?&id=12&template=contenttype&landing=recipes

There are a total of 180 recipe links.



In [106]:
recipes7_9

[{u'Farmbar Pork Burgers with Bread-and-Butter Zucchini Pickles': [u'1 lb. ground pork',
   u'1 lb. fresh chorizo',
   u'\xbd c. fresh ricotta cheese',
   u'3 tbsp. capers',
   u'kosher salt',
   u'Pepper',
   u'6 slice sharp Cheddar cheese',
   u'mayonnaise',
   u'ketchup',
   u'6  brioche burger buns',
   u'butter lettuce']},
 {u'Seared Salmon with Lentil Salad': [u'2 tbsp. olive oil',
   u'2 tsp. olive oil',
   u'1  skinless salmon filllet',
   u'kosher salt',
   u'Pepper',
   u'2 tbsp. fresh lemon juice',
   u'2 tsp. Dijon mustard',
   u'2 stalk celery',
   u'1 small seedless cucumber',
   u'\xbd small red onion',
   u'1 can lentils',
   u'\xbd c. chopped fresh flat-leaf parsley or cilantro']},
 {u'Steakhouse Sirloin with Scallion Fries and Salad': [u'4 c. frozen waffle fries',
   u'2 tbsp. chopped scallions',
   u'1 lb. sirloin steak',
   u'\xbd tsp. salt',
   u'\xbd tsp. Pepper',
   u'2 tbsp. butter',
   u'2 tsp. minced garlic',
   u'1 bag baby arugula',
   u'1 c. grape tomatoes'

In [65]:
recipes10_12 = get_all_recipes([10,11,12])

Generated link for landing page: 10
Succesfully opened link: http://www.delish.com//landing-feed-special/1/10?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 11
Succesfully opened link: http://www.delish.com//landing-feed-special/1/11?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 12
Succesfully opened link: http://www.delish.com//landing-feed-special/1/12?&id=12&template=contenttype&landing=recipes

There are a total of 180 recipe links.



In [85]:
recipes13_15 = get_all_recipes([13,14,15])

Generated link for landing page: 13
Succesfully opened link: http://www.delish.com//landing-feed-special/1/13?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 14
Succesfully opened link: http://www.delish.com//landing-feed-special/1/14?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 15
Succesfully opened link: http://www.delish.com//landing-feed-special/1/15?&id=12&template=contenttype&landing=recipes

There are a total of 180 recipe links.



In [23]:
recipes16_18 = get_all_recipes([16,17,18])

Generated link for landing page: 16
Succesfully opened link: http://www.delish.com//landing-feed-special/1/16?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 17
Succesfully opened link: http://www.delish.com//landing-feed-special/1/17?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 18
Succesfully opened link: http://www.delish.com//landing-feed-special/1/18?&id=12&template=contenttype&landing=recipes

There are a total of 180 recipe links.



In [80]:
recipes19_22 = get_all_recipes([19,20,21,22])

Generated link for landing page: 19
Succesfully opened link: http://www.delish.com//landing-feed-special/1/19?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 20
Succesfully opened link: http://www.delish.com//landing-feed-special/1/20?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 21
Succesfully opened link: http://www.delish.com//landing-feed-special/1/21?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 22
Succesfully opened link: http://www.delish.com//landing-feed-special/1/22?&id=12&template=contenttype&landing=recipes

There are a total of 240 recipe links.



In [102]:
recipes23_26 = get_all_recipes(range(23,27))

Generated link for landing page: 23
Succesfully opened link: http://www.delish.com//landing-feed-special/1/23?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 24
Succesfully opened link: http://www.delish.com//landing-feed-special/1/24?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 25
Succesfully opened link: http://www.delish.com//landing-feed-special/1/25?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 26
Succesfully opened link: http://www.delish.com//landing-feed-special/1/26?&id=12&template=contenttype&landing=recipes

There are a total of 240 recipe links.



In [48]:
recipes27_31 = get_all_recipes(range(27,32))

Generated link for landing page: 27
Succesfully opened link: http://www.delish.com//landing-feed-special/1/27?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 28
Succesfully opened link: http://www.delish.com//landing-feed-special/1/28?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 29
Succesfully opened link: http://www.delish.com//landing-feed-special/1/29?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 30
Succesfully opened link: http://www.delish.com//landing-feed-special/1/30?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 31
Succesfully opened link: http://www.delish.com//landing-feed-special/1/31?&id=12&template=contenttype&landing=recipes

There are a total of 300 recipe links.



In [27]:
recipes32_36 = get_all_recipes(range(32,37))

Generated link for landing page: 32
Succesfully opened link: http://www.delish.com//landing-feed-special/1/32?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 33
Succesfully opened link: http://www.delish.com//landing-feed-special/1/33?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 34
Succesfully opened link: http://www.delish.com//landing-feed-special/1/34?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 35
Succesfully opened link: http://www.delish.com//landing-feed-special/1/35?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 36
Succesfully opened link: http://www.delish.com//landing-feed-special/1/36?&id=12&template=contenttype&landing=recipes

There are a total of 300 recipe links.



In [32]:
recipes37_41 = get_all_recipes(range(37,42))

Generated link for landing page: 37
Succesfully opened link: http://www.delish.com//landing-feed-special/1/37?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 38
Succesfully opened link: http://www.delish.com//landing-feed-special/1/38?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 39
Succesfully opened link: http://www.delish.com//landing-feed-special/1/39?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 40
Succesfully opened link: http://www.delish.com//landing-feed-special/1/40?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 41
Succesfully opened link: http://www.delish.com//landing-feed-special/1/41?&id=12&template=contenttype&landing=recipes

There are a total of 300 recipe links.



In [52]:
recipes42_46 = get_all_recipes(range(42,47))

Generated link for landing page: 42
Succesfully opened link: http://www.delish.com//landing-feed-special/1/42?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 43
Succesfully opened link: http://www.delish.com//landing-feed-special/1/43?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 44
Succesfully opened link: http://www.delish.com//landing-feed-special/1/44?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 45
Succesfully opened link: http://www.delish.com//landing-feed-special/1/45?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 46
Succesfully opened link: http://www.delish.com//landing-feed-special/1/46?&id=12&template=contenttype&landing=recipes

There are a total of 300 recipe links.



In [None]:
recipes47_51 = get_all_recipes(range(47,52))

Generated link for landing page: 47
Succesfully opened link: http://www.delish.com//landing-feed-special/1/47?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 48
Succesfully opened link: http://www.delish.com//landing-feed-special/1/48?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 49
Succesfully opened link: http://www.delish.com//landing-feed-special/1/49?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 50
Succesfully opened link: http://www.delish.com//landing-feed-special/1/50?&id=12&template=contenttype&landing=recipes

Generated link for landing page: 51
Succesfully opened link: http://www.delish.com//landing-feed-special/1/51?&id=12&template=contenttype&landing=recipes

There are a total of 300 recipe links.



In [None]:
#Scraping all the pages takes a long time. Best to save the data in a json file
with 