# Basic Webscraping using Scrapy

This code cannot be executed in this environment; it is included only as a convenient reference.

In [None]:
# Start an interactive Scrapy shell session for the given URL.
# -s overrides a Scrapy setting for this run; here it sets the USER_AGENT
# header that is sent with the request.
scrapy shell -s USER_AGENT='Mozilla/5.0' http://www.nytimes.com

In [None]:
# HTTP status code of the response (e.g. 200)
response.status

# Text content of every <h1> element, returned as a list of strings
response.css('h1::text').extract()

# Text content of every <h2> element
response.css('h2::text').extract()

# All link URLs, using only .css() with the ::attr() pseudo-element
response.css('a::attr(href)').extract()

# Equivalent approach: select the <a> elements with .css(), then read @href with .xpath()
response.css('a').xpath('@href').extract()

# The src attribute (image URL) of every <img> element on the page
response.css('img::attr(src)').extract()

## Subsetting selectors based on attributes

It is usually best to use the `.css()` selector to narrow the selection down by attribute, and then query the result with `.xpath()` to retrieve the information.

In [None]:
# Selecting the URLs of all links (<a>) whose href attribute contains the substring 'image'.
# Note the call parentheses: a bare `.extract` only references the method and returns no data.
response.css('a[href*=image]::attr(href)').extract()

In [None]:
# .re() applies a regular expression to the selected text and returns a plain
# list of strings, so no further .css()/.xpath() calls can be chained onto it
response.css('a[href*=image]::text').re(r'Name:\s*(.*)')

# Only the first match
response.css('a[href*=image]::text').re_first(r'Name:\s*(.*)')

# 

In [None]:
# Extraction based on class: select the elements with class "first", then
# follow a relative XPath from each of them. The leading '.' anchors the path
# at the matched element, so only its direct <time> children are considered.
response.css('.first').xpath('./time/@datetime').extract()

### Using relative XPaths

To make the crawler more robust to layout changes in the document, we should always try to use relative XPaths that start at an id or at another specific attribute that is unlikely to be used for layout purposes.

Classes therefore tend to not be very robust, as they are often used for css layout.

In [None]:
# Basic syntax: '//div' selects every <div> element anywhere in the document
response.xpath('//div')

In [None]:
# Relative queries: the leading '.' anchors the XPath at the already selected
# <div> nodes, so only <a> elements nested inside them are returned
divs = response.xpath('//div')
nested_links = divs.xpath('.//a')
for link in nested_links:
    print(link.extract())

# Pitfall: without the leading '.', '//a' is an absolute path, so this again
# matches every <a> in the whole document rather than only those inside the divs
all_links = divs.xpath('//a')
for link in all_links:
    print(link.extract())

In [None]:
# Using XPath variables to control the scraping: $cnt in the expression is
# bound to the cnt keyword argument, so this returns the id of the first <div>
# that has exactly five direct <a> children
response.xpath('//div[count(a)=$cnt]/@id', cnt=5).extract_first()

### Scraping with Regular Expression

For more precise selection, Scrapy lets us use the full arsenal of regular expressions inside XPath queries.


In [None]:
# The EXSLT 're:test' function probes an attribute against a regular expression.
# The first pattern is written as a raw string so that the backslash in \d is
# passed through unchanged and Python does not warn about an invalid escape sequence.
response.xpath(r'//li[re:test(@class, "item-\d$")]//@href').extract()
response.xpath('//a[re:test(@href, "image")]/@href').extract()

### Set operations

These can be used to exclude parts of the document before starting the scrape.

In [None]:
doc = """
 <div itemscope itemtype="http://schema.org/Product">
   <span itemprop="name">Kenmore White 17" Microwave</span>
   <img src="kenmore-microwave-17in.jpg" alt='Kenmore 17" Microwave' />
   <div itemprop="aggregateRating"
     itemscope itemtype="http://schema.org/AggregateRating">
    Rated <span itemprop="ratingValue">3.5</span>/5
    based on <span itemprop="reviewCount">11</span> customer reviews
   </div>

   <div itemprop="offers" itemscope itemtype="http://schema.org/Offer">
     <span itemprop="price">$55.00</span>
     <link itemprop="availability" href="http://schema.org/InStock" />In stock
   </div>

   Product description:
   <span itemprop="description">0.7 cubic feet countertop microwave.
   Has six preset cooking categories and convenience features like
   Add-A-Minute and Child Lock.</span>

   Customer reviews:

   <div itemprop="review" itemscope itemtype="http://schema.org/Review">
     <span itemprop="name">Not a happy camper</span> -
     by <span itemprop="author">Ellie</span>,
     <meta itemprop="datePublished" content="2011-04-01">April 1, 2011
     <div itemprop="reviewRating" itemscope itemtype="http://schema.org/Rating">
       <meta itemprop="worstRating" content = "1">
       <span itemprop="ratingValue">1</span>/
       <span itemprop="bestRating">5</span>stars
     </div>
     <span itemprop="description">The lamp burned out and now I have to replace
     it. </span>
   </div>

   <div itemprop="review" itemscope itemtype="http://schema.org/Review">
     <span itemprop="name">Value purchase</span> -
     by <span itemprop="author">Lucas</span>,
     <meta itemprop="datePublished" content="2011-03-25">March 25, 2011
     <div itemprop="reviewRating" itemscope itemtype="http://schema.org/Rating">
       <meta itemprop="worstRating" content = "1"/>
       <span itemprop="ratingValue">4</span>/
       <span itemprop="bestRating">5</span>stars
     </div>
     <span itemprop="description">Great microwave for the price. It is small and
     fits in my apartment.</span>
   </div>
   
 </div>
 """
sel = Selector(text=doc, type='html')
for scope in sel.xpath('//div[@itemscope]'):
    print('current score:', scope.xpath('@itemtype').extract())
    props = scope.xpath("""
        set:difference(./descendant::*/@itemprop,
                       .//*[@itemscope]/*/@itemprop)
    """)