# Scrapy Demo notebook

In [51]:
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.settings import Settings
from crochet import setup, wait_for

import json
import logging
import time
import os

import source.scrapy.storages


# Initialise crochet before any crawl is started. crochet runs the Twisted
# reactor in a background thread, which is what lets the crawl cell be re-run
# without hitting the "reactor not restartable" error (see References).
setup()

In [53]:
# Register the project's custom file feed storage with Scrapy.
#
# The setting must be named FEED_STORAGES: Scrapy merges it over its built-in
# FEED_STORAGES_BASE. The previous key ("FEED_STORAGE_BASE") is not a Scrapy
# setting, so it was silently ignored and the custom storage never registered.
#
# Storages are looked up by the URL scheme of the feed URI. A plain path such as
# "./data/quoteresult.json" has an empty scheme, so the storage is registered
# under "" as well as under "file" (for file:// URIs).
settings = Settings({
    "FEED_STORAGES": {
        "": "source.scrapy.storages.CustomFileFeedStorage",
        "file": "source.scrapy.storages.CustomFileFeedStorage"
    }
})

In [54]:
class QuotesSpider(scrapy.Spider):
    """Scrape quotes from the first two pages of quotes.toscrape.com.

    Every scraped item is a dict with the keys ``text``, ``author`` and
    ``tags``. Items are exported as JSON to ``output_path`` via the
    ``FEED_FORMAT`` / ``FEED_URI`` settings below.
    """

    name = "quotes"
    start_urls = [
        "http://quotes.toscrape.com/page/1",
        "http://quotes.toscrape.com/page/2"
    ]

    # Single source of truth for the feed location: used both for FEED_URI and
    # for the truncation workaround in __init__, so the two cannot drift apart.
    output_path = "./data/quoteresult.json"

    custom_settings = {
        "LOG_LEVEL": logging.INFO,
        "FEED_FORMAT": "json",
        "FEED_URI": output_path
    }

    def __init__(self, *args, **kwargs):
        """Initialise the spider and reset the output file to empty.

        All positional and keyword arguments are forwarded to ``scrapy.Spider``.
        """
        super().__init__(*args, **kwargs)
        # This is a hack. The custom file feed storage could not be made to
        # work (the FEED_STORAGES setting did not seem to register it), so the
        # output file is truncated here before the crawl writes to it.
        out_dir = os.path.dirname(self.output_path)
        if out_dir:
            # Without this, opening the file fails on a fresh checkout where
            # the data directory does not exist yet.
            os.makedirs(out_dir, exist_ok=True)
        # Opening in 'wb' mode truncates the file; the context manager
        # guarantees the handle is closed again.
        with open(self.output_path, 'wb'):
            pass

    def parse(self, response):
        """Yield one item per ``div.quote`` element found in ``response``.

        Each item holds the quote text, the author name and the list of tag
        labels extracted with CSS selectors.
        """
        self.logger.info('Parse function called on %s', response.url)
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract()
            }

In [55]:
@wait_for(10)
def run_spider(spider):
    """Crawl with ``spider`` using the notebook-level ``settings``.

    A new ``CrawlerRunner`` is created for every call and the Deferred returned
    by its ``crawl`` method is handed back. The ``wait_for(10)`` decorator from
    crochet wraps the call with a timeout argument of 10.
    """
    return CrawlerRunner(settings).crawl(spider)

In [56]:
# Start the crawl with the spider class (not an instance); the feed is written
# to the FEED_URI configured in QuotesSpider.custom_settings.
run_spider(QuotesSpider)

2020-03-24 22:50:49 [scrapy.crawler] INFO: Overridden settings:
{'FEED_FORMAT': 'json', 'FEED_URI': './data/quoteresult.json', 'LOG_LEVEL': 20}
2020-03-24 22:50:49 [scrapy.extensions.telnet] INFO: Telnet Password: 82102a540c8c898f
2020-03-24 22:50:49 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2020-03-24 22:50:49 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloaderm

In [57]:
ll data/

total 48
-rw-r--r--  1 nbvasu  staff  5491 Mar 24 22:09 quotejson.pickle
-rw-r--r--  1 nbvasu  staff  5573 Mar 24 22:50 quoteresult.json
-rw-r--r--  1 nbvasu  staff  5573 Mar 24 22:42 quoteresult_test.json


In [58]:
!tail -n 2 data/quoteresult.json

{"text": "\u201cLife is what happens to us while we are making other plans.\u201d", "author": "Allen Saunders", "tags": ["fate", "life", "misattributed-john-lennon", "planning", "plans"]}
]

In [59]:
import pandas as pd

# Load the JSON feed (same path as the spider's FEED_URI) into a DataFrame.
df = pd.read_json('./data/quoteresult.json')

In [60]:
df

Unnamed: 0,text,author,tags
0,“The world as we have created it is a process ...,Albert Einstein,"[change, deep-thoughts, thinking, world]"
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"[abilities, choices]"
2,“There are only two ways to live your life. On...,Albert Einstein,"[inspirational, life, live, miracle, miracles]"
3,"“The person, be it gentleman or lady, who has ...",Jane Austen,"[aliteracy, books, classic, humor]"
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"[be-yourself, inspirational]"
5,“Try not to become a man of success. Rather be...,Albert Einstein,"[adulthood, success, value]"
6,“It is better to be hated for what you are tha...,André Gide,"[life, love]"
7,"“I have not failed. I've just found 10,000 way...",Thomas A. Edison,"[edison, failure, inspirational, paraphrased]"
8,“A woman is like a tea bag; you never know how...,Eleanor Roosevelt,[misattributed-eleanor-roosevelt]
9,"“A day without sunshine is like, you know, nig...",Steve Martin,"[humor, obvious, simile]"


In [61]:
# Persist the scraped quotes DataFrame as a pickle file in the data directory.
df.to_pickle('data/quotejson.pickle')

In [62]:
ll data/

total 48
-rw-r--r--  1 nbvasu  staff  5491 Mar 24 22:50 quotejson.pickle
-rw-r--r--  1 nbvasu  staff  5573 Mar 24 22:50 quoteresult.json
-rw-r--r--  1 nbvasu  staff  5573 Mar 24 22:42 quoteresult_test.json


# References

https://docs.scrapy.org/en/latest/intro/overview.html

https://www.jitsejan.com/using-scrapy-in-jupyter-notebook.html

https://stackoverflow.com/questions/41495052/scrapy-reactor-not-restartable
