# Scrapy Demo notebook

In [142]:
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.extensions.feedexport import FileFeedStorage
from crochet import setup, wait_for
import json
import logging
import time
import os

# Initialise crochet up front; run_spider below relies on crochet's wait_for decorator.
setup()

In [143]:
class CustomFileFeedStorage(FileFeedStorage):
    """
    A file feed storage extension that overwrites existing files.

    ``open`` always opens the target in ``"wb"`` mode, so whatever an
    earlier run left in the file is discarded.
    """

    def __init__(self, uri=None, *args, **kwargs):
        """
        Build the storage and its logger.

        :param uri: feed URI; when given, it is forwarded together with any
            extra arguments to ``FileFeedStorage.__init__`` (which is
            expected to derive ``self.path`` from it -- confirm against the
            installed Scrapy version). When omitted, the instance is created
            without a path, as before, and ``open`` cannot be used until
            ``self.path`` has been assigned.
        """
        if uri is not None:
            # Without this call the base class never records the target path
            # that open() reads.
            super(CustomFileFeedStorage, self).__init__(uri, *args, **kwargs)
        self.logger = logging.getLogger('CustomFileFeedStorage')

    def open(self, spider):
        """
        Open the feed file for writing, truncating any existing content.

        Creates the parent directory first when it is missing.

        :param spider: the running spider (unused here).
        :return: a binary file object opened with mode ``"wb"``.
        """
        # Lazy %-style argument: the original call passed the path without a
        # placeholder, which makes the logging module report a format error.
        self.logger.info('custom file feed storage: %s', self.path)
        dirname = os.path.dirname(self.path)
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)
        return open(self.path, "wb")

In [144]:
class QuotesSpider(scrapy.Spider):
    """
    Spider that collects the text, author and tags of every quote found on
    the pages listed in ``start_urls``.

    The feed is exported as JSON to the path given by
    ``custom_settings["FEED_URI"]``.
    """

    name = "quotes"
    start_urls = [
        "http://quotes.toscrape.com/page/1",
        "http://quotes.toscrape.com/page/2"
    ]

    custom_settings = {
        "LOG_LEVEL": logging.INFO,
        "FEED_FORMAT": "json",
        "FEED_URI": "./data/quoteresult.json"
    }

    def __init__(self, *args, **kwargs):
        """
        Initialise the spider and truncate the feed file.

        All positional and keyword arguments are forwarded to
        ``scrapy.Spider.__init__``.
        """
        super(QuotesSpider, self).__init__(*args, **kwargs)
        # This is a hack. Couldn't get the custom file feed storage to work.
        # FEED_STORAGES needs an entry for file and it has to be a fully
        # qualified class name, which doesn't seem to work inside Jupyter
        # notebooks. Instead, empty the feed file here so that a re-run does
        # not add to the output of a previous run.
        feed_path = self.custom_settings["FEED_URI"]
        feed_dir = os.path.dirname(feed_path)
        # On a fresh checkout the output directory does not exist yet and
        # open() would raise; create it first.
        if feed_dir and not os.path.exists(feed_dir):
            os.makedirs(feed_dir)
        with open(feed_path, 'wb'):
            pass  # opening in "wb" mode is enough to truncate the file

    def parse(self, response):
        """
        Yield one dict per ``div.quote`` element in the response.

        :param response: the downloaded page.
        :return: generator of dicts with keys ``text``, ``author`` and
            ``tags`` (``tags`` is a list).
        """
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract()
            }

In [145]:
@wait_for(10)
def run_spider(spider):
    """
    Schedule a crawl of ``spider`` on a fresh ``CrawlerRunner``.

    :param spider: the spider class to crawl.
    :return: whatever ``CrawlerRunner.crawl`` returns; the ``wait_for(10)``
        decorator wraps this function (presumably blocking for up to
        10 seconds -- confirm against the crochet documentation).
    """
    return CrawlerRunner().crawl(spider)

In [146]:
# Run the crawl; QuotesSpider.custom_settings points the JSON feed at ./data/quoteresult.json.
run_spider(QuotesSpider)

In [147]:
ll data/

total 32
-rw-r--r--  1 nbvasu  staff  5491 Mar 24 21:33 quotejson.pickle
-rw-r--r--  1 nbvasu  staff  5573 Mar 24 22:09 quoteresult.json


In [148]:
!tail -n 2 data/quoteresult.json

{"text": "\u201cA day without sunshine is like, you know, night.\u201d", "author": "Steve Martin", "tags": ["humor", "obvious", "simile"]}
]

In [149]:
import pandas as pd

# Load the JSON feed file configured in QuotesSpider.custom_settings into a DataFrame.
df = pd.read_json('./data/quoteresult.json')

In [150]:
df

Unnamed: 0,text,author,tags
0,“This life is what you make it. No matter what...,Marilyn Monroe,"[friends, heartbreak, inspirational, life, lov..."
1,“It takes a great deal of bravery to stand up ...,J.K. Rowling,"[courage, friends]"
2,"“If you can't explain it to a six year old, yo...",Albert Einstein,"[simplicity, understand]"
3,"“You may not be her first, her last, or her on...",Bob Marley,[love]
4,"“I like nonsense, it wakes up the brain cells....",Dr. Seuss,[fantasy]
5,"“I may not have gone where I intended to go, b...",Douglas Adams,"[life, navigation]"
6,"“The opposite of love is not hate, it's indiff...",Elie Wiesel,"[activism, apathy, hate, indifference, inspira..."
7,"“It is not a lack of love, but a lack of frien...",Friedrich Nietzsche,"[friendship, lack-of-friendship, lack-of-love,..."
8,"“Good friends, good books, and a sleepy consci...",Mark Twain,"[books, contentment, friends, friendship, life]"
9,“Life is what happens to us while we are makin...,Allen Saunders,"[fate, life, misattributed-john-lennon, planni..."


In [151]:
# Persist the DataFrame as a pickle in the same data/ directory as the JSON feed.
df.to_pickle('data/quotejson.pickle')

In [152]:
ll data/

total 32
-rw-r--r--  1 nbvasu  staff  5491 Mar 24 22:09 quotejson.pickle
-rw-r--r--  1 nbvasu  staff  5573 Mar 24 22:09 quoteresult.json


# References

https://docs.scrapy.org/en/latest/intro/overview.html

https://www.jitsejan.com/using-scrapy-in-jupyter-notebook.html

https://stackoverflow.com/questions/41495052/scrapy-reactor-not-restartable
