# Web Scraping Delish Recipes

My goals are to write a scraper that will:

1) Return specific pieces of information (rather than just downloading a whole page)  

2) Iterate over multiple pages/queries  

3) Save the data to my computer

Let's get to it.

In [1]:
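# Imports are repeated in each cell because the kernel must be restarted between crawls
# (a Scrapy CrawlerProcess cannot be started a second time in the same process).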
import scrapy
import re
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor

class DelishSpider(scrapy.Spider):
    """Spider that collects recipe details from the Delish recipe-ideas listing.

    ``parse`` walks the listing page and schedules one request per recipe
    link; ``parse_recipe`` pulls the title, author, introduction and
    directions out of each recipe page.
    """

    # Give it a name
    name = 'DES'

    # Establish a URL to start with
    start_urls = ['https://www.delish.com/cooking/recipe-ideas']

    def parse(self, response):
        """Yield a request for every recipe linked from the listing page.

        Args:
            response: Scrapy response for the listing page.

        Yields:
            scrapy.Request: one request per recipe link, handled by
            ``parse_recipe``.
        """
        # Parse the list of recipes
        all_recipes = response.xpath('//div[@class="full-item-content"]')

        # Find the link for each recipe
        for recipe in all_recipes:
            href = recipe.xpath('.//a/@href').extract_first()

            # A container without a link would otherwise raise and abort
            # the rest of the listing; skip it instead.
            if href is None:
                continue

            # Resolve the link against the page URL rather than gluing it
            # onto the start URL: hrefs may be root-relative or absolute,
            # and plain concatenation would produce a malformed address.
            rec_url = response.urljoin(href)

            # Yield this URL so we can scrape inside it
            yield scrapy.Request(rec_url, callback=self.parse_recipe)

    def parse_recipe(self, response):
        """Extract the fields of interest from a single recipe page.

        Args:
            response: Scrapy response for a recipe (or slideshow) page.

        Yields:
            dict: keys ``title``, ``author``, ``introduction`` and
            ``directions``. Any value may be ``None`` when the matching
            element is missing; ``directions`` is the literal string
            ``'slideshow'`` when neither recipe-introduction block is found.
        """
        title = response.xpath('//div/h1/text()').extract_first()
        author = response.xpath('//div/a/span[@class="byline-name"]/text()').extract_first()
        intro = response.xpath('//div[@class="recipe-introduction show-more"]/p/text()').extract_first()
        directions = response.xpath('//div[@class="direction-lists"]').extract_first()

        # sometimes the author is in a different spot though
        if author is None:
            author = response.xpath('//div/span[@class="byline-name"]/text()').extract_first()

        # depending how long the intro is it might say "show less" instead of "more"
        if intro is None:
            intro = response.xpath('//div[@class="recipe-introduction show-less"]/p/text()').extract_first()

            # if intro is still empty, this is probably a slideshow page with no directions
            if intro is None:
                intro = response.xpath('//div[@class="slideshow-desktop-dek"]/p/span/text()').extract_first()
                directions = 'slideshow'

        yield {
            'title': title,
            'author': author,
            'introduction': intro,
            'directions': directions,
        }
        
        

# Crawler configuration: where the output goes, plus polite-crawling options
process = CrawlerProcess(dict(
    FEED_FORMAT='json',                             # Export scraped items as JSON
    FEED_URI='delish_data.json',                    # File the items are written to
    LOG_ENABLED=False,                              # Keep Scrapy's log output quiet
    ROBOTSTXT_OBEY=True,                            # Respect the site's robots.txt
    USER_AGENT='GABootcampCrawler (thinkful.com)',  # Identify ourselves to the site
    AUTOTHROTTLE_ENABLED=True,                      # Let Scrapy adapt the request rate
    HTTPCACHE_ENABLED=True,                         # Cache pages that were already fetched
))

# Register the spider, run the crawl, then report completion.
process.crawl(DelishSpider)
process.start()
print('Success!')

Success!
