In [None]:
import os # To get the path of this file
import scrapy # Library for the crawler
import json # To output our data in JSON format
import logging
import threading # For locks

# Scrapy specific imports
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

# Settings for notebook & scrapy
from IPython.core.interactiveshell import InteractiveShell
# Set IPython's `ast_node_interactivity` option to "all".
# NOTE(review): per the IPython docs this makes a cell display the value of
# every top-level expression instead of only the last one - confirm this is
# still wanted.
InteractiveShell.ast_node_interactivity = "all"

# Make a lock for counter value
# The spider takes this re-entrant lock while it increments and reads its
# `item_counter`, so each item receives its own `order_id`.
lock = threading.RLock()

In [None]:
class JsonWriterPipeline(object):
    """
    Item pipeline that collects every item in the single file 'euresult.jl',
    written as one JSON object per line.
    """

    def open_spider(self, spider):
        """
        Open (and truncate) the output file when the spider starts.
        """
        self.file = open('euresult.jl', 'w')

    def close_spider(self, spider):
        """
        Close the output file once the spider is done.
        """
        self.file.close()

    def process_item(self, item, spider):
        """
        Serialise the item to JSON, write it on a line of its own and return
        the item unchanged.
        """
        serialised = json.dumps(dict(item))
        self.file.write("{}\n".format(serialised))
        return item
    

class JsonSeperateFileWriterPipeline(object):
    """
    This pipeline writes every item to its own JSON file in the directory
    'files' (inside the current working directory). The file name is the
    item's 'local_id' with every '/' removed, followed by '.json'.
    """

    def open_spider(self, spider):
        """
        Nothing has to be set up when starting this pipeline; only announce it.
        """
        print("Starting JsonSeperateFileWriterPipeline")

    def close_spider(self, spider):
        """
        Print the statistics for this run when the spider quits.

        The number of processed documents is read from `spider.item_counter`.
        """
        print("Finished! We processed " + str(spider.item_counter) + " files.")
        print("Closing JsonSeperateFileWriterPipeline")

    def process_item(self, item, spider):
        """
        Write the item to a separate JSON file and return the item unchanged.

        The item must provide a string under the key 'local_id'. The output
        directory is created on demand, so a missing 'files' directory no
        longer makes the write fail.
        """
        # os.path.join picks the right separator for the platform, which also
        # makes this work on Windows.
        output_dir = os.path.join(os.path.abspath(os.curdir), 'files')
        os.makedirs(output_dir, exist_ok=True)

        # Slashes are dropped from the id so it cannot address a sub-directory.
        file_name = '{}.json'.format(item['local_id'].replace('/', ''))

        # The context manager closes the file even when writing fails.
        with open(os.path.join(output_dir, file_name), 'w') as output_file:
            output_file.write(json.dumps(dict(item)) + "\n")
        return item

In [None]:
class EucybsecSpider(CrawlSpider):
    """
    The spider designed for the European Union CELEX website.

    Starting from a EUR-Lex search-result page it follows the "Next Page" link
    of every result page and requests each document that is listed on it.
    """

    name = 'eucybsec'
    allowed_domains = ['eur-lex.europa.eu'] # URLs outside these domains are not followed
    start_urls = [ # TODO: Soft-code search terms
        'https://eur-lex.europa.eu/search.html?text=cyber%20security&scope=EURLEX&type=quick&lang=en'
    ]

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {
#             '__main__.JsonWriterPipeline': 1,
            '__main__.JsonSeperateFileWriterPipeline': 1,
        },
    }

    rules = (
        Rule(
            LinkExtractor(
                allow=(),
                restrict_xpaths=('(//a[@title="Next Page"])[1]',), # This will extract the URL for the next page, if existent
            ),
            callback='parse_item',
            follow=True,
        ),
    )

    # Number of documents yielded so far; also the source of each 'order_id'.
    item_counter = 0

    def parse_document_page(self, response):
        """
        Yield the values for one specific document.

        A page whose document id cannot be determined is skipped with a
        warning instead of raising inside the callback; such a page does not
        increase `item_counter`.
        """

        # The id is the second whitespace-separated token of the heading.
        # Check it first: without an id the item cannot be stored.
        heading = response.xpath('//*[contains(@class,"DocumentTitle")]/text()').extract_first()
        heading_parts = heading.split() if heading else []
        if len(heading_parts) < 2:
            self.logger.warning('Skipping %s: could not determine the document id', response.url)
            return
        local_id = heading_parts[1]

        # The neatest way to get the document date, but not always available
        document_date = response.xpath('//meta[@property="eli:date_document"]/@content').extract_first()

        # The other way to get the document date.
        if not document_date:
            document_date = response.xpath('//p[@class="hd-date"]/text()').extract_first()
            if document_date:
                document_date = document_date.strip()

        # Compress the content for efficiency: collapse every run of
        # whitespace into a single space and lower-case the text.
        # TODO: See #5
        content_elements = response.xpath('//div[@class="tabContent"]//text()').extract()
        content = ' '.join(' '.join(content_elements).split()).lower()

        # Increment and read the counter under the lock so the order id
        # belongs to this document only.
        with lock:
            self.item_counter += 1
            this_item = self.item_counter

        yield {
            'title': response.xpath("//p[@id='translatedTitle']/text()").extract_first(),
            'local_id': local_id,
            'order_id': this_item,
            'document_type': response.xpath('//meta[@property="eli:type_document"]/@resource').extract_first(),
            'publication_date': response.xpath('//meta[@property="eli:date_publication"]/@content').extract_first(),
            'document_date': document_date,
            'direct_url': response.url,
            'content': content,
        }

    def parse_item(self, response):
        """
        Parse a page with search results and request every listed document.

        Following the next result page is handled by `rules`.
        """

        print('Processing.. ' + response.url)
        item_links = response.xpath("//div[contains(@class,'SearchResult')]/h2/a/@href").extract()

        for href in item_links:
            # Resolve the href against the page URL instead of slicing off its
            # first character, so './x', '/x' and absolute links all work.
            yield scrapy.Request(response.urljoin(href), callback=self.parse_document_page)


    def parse_start_url(self, response):
        """
        Treat the start URL as a regular search-result page.
        """
        return self.parse_item(response)
            


In [None]:
def run_crawler():
    """
    Create a crawler process with a fixed user agent, schedule the
    EucybsecSpider on it and start it.
    """
    process_settings = {
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    }
    crawler_process = CrawlerProcess(process_settings)

    crawler_process.crawl(EucybsecSpider)
    crawler_process.start()
    
# TODO: Figure out how to 'restart' without having to fully restart the kernel.
# NOTE(review): presumably a second call fails because the process started by
# `process.start()` cannot be started again in the same kernel - confirm.
# Start the crawl as soon as this cell is executed.
run_crawler()