# Webscraping with Scrapy

There are many ways to perform webscraping. 

In this Jupyter notebook, I'll use only Scrapy to show what you can do.  

In [1]:
import scrapy

In [2]:
#Let's import the selector function of Scrapy
from scrapy import Selector

In [3]:
# Import the CrawlerProcess to create a spider that goes over the page 
from scrapy.crawler import CrawlerProcess


In [4]:
#for this scrapy spider, I've chosen the courses page of DataCamp to be crawled through
url_short = 'https://www.datacamp.com/courses/all'


In [5]:
# Create the Spider class
class YourSpider(scrapy.Spider):
    name = 'yourspider'
    
    # start_requests method
    def start_requests( self ):
        yield scrapy.Request(url = url_short, callback = self.parse)
      
    def parse(self, response):
    #Now collect the titles and descriptions of the courses
        crs_titles = response.xpath('//h4[contains(@class,"course-block__title")]/text()').extract()
        crs_descrs = response.xpath('//p[contains(@class,"course-block__description")]/text()').extract()
        for crs_title, crs_descr in zip(crs_titles, crs_descrs ):
            dc_dict[crs_title] = crs_descr
    
# Initialize the dictionary **outside** of the Spider class
dc_dict = dict()

# Run the Spider
process = CrawlerProcess()
process.crawl(YourSpider)
process.start()

2020-03-05 15:21:41 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: scrapybot)
2020-03-05 15:21:41 [scrapy.utils.log] INFO: Versions: lxml 4.3.2.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.7.0, Python 3.7.3 (default, Mar 27 2019, 17:13:21) [MSC v.1915 64 bit (AMD64)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1c  28 May 2019), cryptography 2.6.1, Platform Windows-10-10.0.18362-SP0
2020-03-05 15:21:41 [scrapy.crawler] INFO: Overridden settings: {}
2020-03-05 15:21:41 [scrapy.extensions.telnet] INFO: Telnet Password: 8898cf62d5fa4612
2020-03-05 15:21:41 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2020-03-05 15:21:42 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defa

In [6]:
dc_dict

{'Introduction to R': '\n          Master the basics of data analysis by manipulating common data structures such as vectors, matrices, and data frames.\n        ',
 'Intermediate R': '\n          Continue your journey to becoming an R ninja by learning about conditional statements, loops, and vector functions.\n        ',
 'Introduction to Machine Learning': '\n          Learn to train and assess models performing common machine learning tasks such as classification and clustering.\n        ',
 'Cleaning Data in R': '\n          Learn to explore your data so you can properly clean and prepare it for analysis.\n        ',
 'Introduction to Python': '\n          Master the basics of data analysis in Python. Expand your skillset by learning scientific computing with numpy.\n        ',
 'Intermediate R: Practice': '\n          Strengthen your knowledge of the topics you learned in Intermediate R with a ton of new and fun exercises.\n        ',
 'Data Visualization with ggplot2 (Part 1)': 

In [None]:
#Another way to get the same output is:

 Create the Spider class
class DC_Description_Spider(scrapy.Spider):
    name = "dc_chapter_spider"
    
  # start_requests method
    def start_requests(self):
        yield scrapy.Request(url = url_short,
                         callback = self.parse_front)
 
 # First parsing method
    def parse_front(self, response):
        course_blocks = response.css('div.course-block')
        course_links = course_blocks.xpath('./a/@href')
        links_to_follow = course_links.extract()
        for url in links_to_follow:
            yield response.follow(url = url,
                            callback = self.parse_pages)

 # Second parsing method
    def parse_pages(self, response):
    # Create a SelectorList of the course titles text
        crs_title = response.xpath('//h4[contains(@class,"course-block__title")]/text()')
    # Extract the text and strip it clean
        crs_title_ext = crs_title.extract_first().strip()
    # Create a SelectorList of course descriptions text
        crs_descr = response.css( 'p.course-block__description::text' )
    # Extract the text and strip it clean
        crs_descr_ext = crs_descr.extract_first().strip()
    # Fill in the dictionary
        dc_dict[crs_title_ext] = crs_descr_ext
    
# Initialize the dictionary **outside** of the Spider class
dc_dict = dict()

# Run the Spider
process = CrawlerProcess()
process.crawl(DC_Description_Spider)
process.start()
