In [1]:
import json
import logging

import pandas as pd
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from twisted.internet import reactor

In [2]:
class QuotesSpider(scrapy.Spider):
    """Spider that collects quotes from two pages of quotes.toscrape.com.

    Every ``div.quote`` element on a page becomes one item holding the
    quote text, its author and the list of tag names. Items are exported
    through the feed settings declared in ``custom_settings``.
    """

    name = "quotes"

    # Only these two listing pages are requested; parse() yields no
    # follow-up requests.
    start_urls = [
        "http://quotes.toscrape.com/page/1",
        "http://quotes.toscrape.com/page/2"
    ]

    # Per-spider overrides: log at WARNING level and export the scraped
    # items as a JSON feed to ./data/quoteresult.json.
    # NOTE(review): whether an existing feed file is overwritten or
    # appended to on a re-run depends on the Scrapy version - confirm.
    custom_settings = {
        "LOG_LEVEL": logging.WARNING,
        "FEED_FORMAT": "json",
        "FEED_URI": "./data/quoteresult.json"
    }

    def parse(self, response):
        """Yield one dict with 'text', 'author' and 'tags' per quote block."""
        for quote_block in response.css('div.quote'):
            quote_text = quote_block.css('span.text::text').get()
            author_name = quote_block.css('span small::text').get()
            tag_names = quote_block.css('div.tags a.tag::text').getall()
            yield {
                'text': quote_text,
                'author': author_name,
                'tags': tag_names
            }

In [3]:
def run_spider(spider):
    """Crawl with ``spider`` and block until the crawl has finished.

    Configures Scrapy logging, schedules the spider on a fresh
    ``CrawlerRunner`` and runs the Twisted reactor. The reactor is stopped
    once the deferred returned by ``runner.join()`` fires, whether the
    crawl succeeded or failed.

    Note: a Twisted reactor cannot be started again after it has been
    stopped, so calling this a second time in the same kernel is expected
    to fail; restart the kernel before re-running.

    Args:
        spider: the spider class to crawl with.
    """
    configure_logging()

    crawler_runner = CrawlerRunner()
    crawler_runner.crawl(spider)

    def _stop_reactor(_result):
        # Called with either a result or a Failure; the reactor is stopped
        # in both cases so reactor.run() below can return.
        reactor.stop()

    all_crawls_done = crawler_runner.join()
    all_crawls_done.addBoth(_stop_reactor)

    # Blocks here until _stop_reactor is invoked.
    reactor.run()

In [4]:
# Start the crawl; run_spider blocks in reactor.run() until the crawl is done.
# Scraped items are exported to ./data/quoteresult.json, as configured in
# QuotesSpider.custom_settings.
run_spider(QuotesSpider)

2020-03-24 18:46:19 [scrapy.crawler] INFO: Overridden settings:
{'FEED_FORMAT': 'json', 'FEED_URI': './data/quoteresult.json', 'LOG_LEVEL': 30}


In [5]:
ll data/

total 16
-rw-r--r--  1 nbvasu  staff  5573 Mar 24 18:46 quoteresult.json


In [6]:
# Show the last two lines of the JSON feed file to check how it ends.
!tail -n 2 data/quoteresult.json

{"text": "\u201cA day without sunshine is like, you know, night.\u201d", "author": "Steve Martin", "tags": ["humor", "obvious", "simile"]}
]

In [7]:
# Load the exported JSON feed into a DataFrame with one row per quote
# (pandas is imported in the imports cell at the top of the notebook).
df = pd.read_json('data/quoteresult.json')

In [8]:
df

Unnamed: 0,text,author,tags
0,“This life is what you make it. No matter what...,Marilyn Monroe,"[friends, heartbreak, inspirational, life, lov..."
1,“It takes a great deal of bravery to stand up ...,J.K. Rowling,"[courage, friends]"
2,"“If you can't explain it to a six year old, yo...",Albert Einstein,"[simplicity, understand]"
3,"“You may not be her first, her last, or her on...",Bob Marley,[love]
4,"“I like nonsense, it wakes up the brain cells....",Dr. Seuss,[fantasy]
5,"“I may not have gone where I intended to go, b...",Douglas Adams,"[life, navigation]"
6,"“The opposite of love is not hate, it's indiff...",Elie Wiesel,"[activism, apathy, hate, indifference, inspira..."
7,"“It is not a lack of love, but a lack of frien...",Friedrich Nietzsche,"[friendship, lack-of-friendship, lack-of-love,..."
8,"“Good friends, good books, and a sleepy consci...",Mark Twain,"[books, contentment, friends, friendship, life]"
9,“Life is what happens to us while we are makin...,Allen Saunders,"[fate, life, misattributed-john-lennon, planni..."


In [9]:
# Save the DataFrame as a pickle file next to the JSON feed.
df.to_pickle('data/quotejson.pickle')

In [10]:
ll data/

total 32
-rw-r--r--  1 nbvasu  staff  5491 Mar 24 18:46 quotejson.pickle
-rw-r--r--  1 nbvasu  staff  5573 Mar 24 18:46 quoteresult.json
