# <u>Chapter 7</u>: Summarizing Wikipedia Articles

In [None]:
import sys
import subprocess
import pkg_resources

# Work out which of the required packages are not installed yet.
required_packages = {'scrapy'}
installed_packages = set()
for dist in pkg_resources.working_set:
    installed_packages.add(dist.key)
missing_packages = required_packages.difference(installed_packages)

# Install anything that is missing through pip, run as a module of the
# interpreter executing this notebook; pip's standard output is discarded.
if missing_packages:
    notice = 'Installing the following packages: ' + str(missing_packages)
    print(notice)
    python = sys.executable
    pip_command = [python, '-m', 'pip', 'install']
    pip_command.extend(missing_packages)
    subprocess.check_call(pip_command, stdout=subprocess.DEVNULL)

## Introducing web scraping

The ``scrapy`` framework is an elegant way to implement spiders in Python for large-scale web scraping. In the code that follows, we define a spider, set its start URL, and implement the ``parse`` method that extracts the text, author, and tags of each quote on the page.

In [None]:
import scrapy

# A spider that scrapes quotes, starting from quotes.toscrape.com.
class QuotesSpider(scrapy.Spider):
    """Spider that prints every quote found on the pages it visits."""

    name = 'quote_spider'
    start_urls = ['http://quotes.toscrape.com']

    def parse(self, response):
        """Print the URL of ``response`` followed by one dict per quote.

        Each printed dict has the keys ``text``, ``author`` and ``tags``,
        filled from the CSS selections made on the matching quote element.
        """
        print(f"Visiting: {response.url}")

        # Every quote sits in its own ``div`` element with class ``quote``.
        for block in response.css("div.quote"):
            record = {
                'text': block.css("span.text::text").get(),
                'author': block.css("small.author::text").get(),
                'tags': block.css("div.tags a.tag::text").getall(),
            }
            print(record)

Next, let's create and start a crawler process using the ``QuotesSpider``.

In [None]:
from scrapy.crawler import CrawlerProcess

# Settings for the crawler process: only the user agent string is overridden.
process_settings = {
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
}
process = CrawlerProcess(process_settings)

# Build a crawler for the quote spider, schedule it, and start the process.
crawler = process.create_crawler(QuotesSpider)
process.crawl(crawler)
process.start()

### Machine Learning Techniques for Text 
&copy;2022&ndash;2023, Nikos Tsourakis, <nikos@tsourakis.net>, Packt Publications. All Rights Reserved.