# Scrapy - Redis Example

## Introduction

### Redis is an open source (BSD licensed), in-memory data structure store, used as a database, cache, and message broker

In [17]:
# https://redis.io/ 
# https://try.redis.io/

# "An operation during which a processor can simultaneously 
# read a location and write it in the same bus operation.
# This prevents any other processor or I/O device from writing 
# or reading memory until the operation is complete"

In [1]:
import os
import redis
import json

# For Redis set up notes see : https://www.ubuntupit.com/how-to-install-and-configure-redis-on-linux-system/

In [2]:
#RC = Client(host=os.environ.get('REDIS_HOST'),password='M0F0_741!', charset='utf-8', decode_responses=True)

RC = redis.Redis(host=os.environ.get('REDIS_HOST'), charset='utf-8', decode_responses=True)

In [3]:
RC # Connection object

Redis<ConnectionPool<Connection<host=None,port=6379,db=0>>>

In [4]:
RC.set("foo", 2)

True

In [5]:
RC.get("foo")

'2'

In [6]:
RC.mset({"Croatia": "Zagreb", "Bahamas": "Nassau"})

True

In [7]:
RC.get("Bahamas")

'Nassau'

In [8]:
mydict = { 'var1' : 5, 'var2' : 9, 'var3': [1, 5, 9] }
rval = json.dumps(mydict)
RC.set('key_1', rval)

True

In [9]:
data = RC.get('key_1')
print(data)

{"var1": 5, "var2": 9, "var3": [1, 5, 9]}


In [10]:
dc = json.loads(data)
print(type(dc))

<class 'dict'>


In [29]:
# check if we can use dict as dataframe as-is
for k,v in dc.items():
    print(v)
    assert len(str(v))==1, "value too long - so we can't make df with this"

5
9
[1, 5, 9]


AssertionError: value too long - so we can't make df with this

In [30]:
# https://pypi.org/project/scrapy-redis/
# You can start multiple spider instances that share a single redis queue. 
# Best suitable for broad multi-domain crawls.

In [31]:
#!pip install scrapy-redis
from scrapy_redis.spiders import RedisSpider

class MySpider(RedisSpider):
    name = 'myspider'

    def parse(self, response):
        # do stuff
        pass

## Feeding a Spider from Redis

##### The class scrapy_redis.spiders.RedisSpider enables a spider to read the urls from redis. 


In [32]:
# The urls in the redis queue will be processed one after another, 
# if the first request yields more requests, 
# the spider will process those requests before fetching another url from redis.

In [33]:
# https://pypi.org/project/scrapy-redis/

In [11]:
!echo -e 'lpush myspider:start_urls http://google.com https://msn.com http://books.toscrape.com' | redis-cli 

(integer) 3


In [None]:
!scrapy runspider myspider.py

2021-06-09 15:57:34 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: scrapybot)
2021-06-09 15:57:34 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.2.0, Python 3.8.8 (default, Apr 13 2021, 19:58:26) - [GCC 7.3.0], pyOpenSSL 20.0.1 (OpenSSL 1.1.1k  25 Mar 2021), cryptography 3.4.7, Platform Linux-5.4.0-73-generic-x86_64-with-glibc2.10
2021-06-09 15:57:34 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2021-06-09 15:57:34 [scrapy.crawler] INFO: Overridden settings:
{'SPIDER_LOADER_WARN_ONLY': True}
2021-06-09 15:57:34 [scrapy.extensions.telnet] INFO: Telnet Password: 2bb08a8270fd9a93
2021-06-09 15:57:34 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2021-06-09 15:57:34 [myspider] INFO: Reading start URLs from redi