In [None]:
import io # To read downloaded PDF bytes from memory instead of a temporary file
import json # To output our data in JSON format
import logging
import os # To get the path of this file
import threading # For locks

import PyPDF2 # To parse PDF files when a HTML version is not available
import scrapy # Library for the crawler
from inline_requests import inline_requests # Addition to scrapy to make certain inline requests possible, used for PDFs

# Scrapy specific imports
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request

# Settings for notebook & scrapy
from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to run every expression node in a cell interactively,
# instead of its default behaviour
InteractiveShell.ast_node_interactivity = "all"

# Re-entrant lock used by EucybsecSpider while it increments its item counter.
# NOTE(review): Scrapy callbacks presumably run on a single reactor thread, so
# this lock may be unnecessary - confirm before removing.
lock = threading.RLock()

In [None]:
# Dry-run switch: if set to True, no files are actually written
# (checked by JsonSeperateFileWriterPipeline.process_item).
DRY_RUN = False

# The (URL-encoded) search terms we are using to retrieve information.
# Further candidates, currently disabled: 'cybersecurity', 'cybercrime'.
SEARCH_KEYS = ['cyber%20security']

In [None]:
class JsonWriterPipeline(object):
    """
    This pipeline will output all data in one JSON file, line separated.

    The output file is 'euresult.jl' in the current working directory. When
    the module-level DRY_RUN flag is set, no file is created or written and
    items are only passed through.
    """

    # Handle of the output file; stays None until open_spider() opened it
    # (and for the whole run when DRY_RUN is set).
    file = None

    def open_spider(self, spider):
        """
        When starting this pipeline, the file needs to be opened.

        Nothing is opened during a dry run, so no (empty) file is left behind.
        """
        if not DRY_RUN:
            self.file = open('euresult.jl', 'w')

    def close_spider(self, spider):
        """
        When the crawler is done, the file needs to be closed.

        Safe to call when no file was opened (dry run, or open_spider never ran).
        """
        if self.file is not None:
            self.file.close()
            self.file = None

    def process_item(self, item, spider):
        """
        Every item that is processed is written to a new line in the file.

        Returns the item unchanged so later pipelines can process it as well.
        """
        if self.file is not None:
            self.file.write(json.dumps(dict(item)) + "\n")
        return item
    

class JsonSeperateFileWriterPipeline(object):
    """
    This pipeline will output all data in separate JSON files, using the ID as file name, storing in the directory 'files'.

    The directory is located in the current working directory and is created
    on demand. Nothing is written when the module-level DRY_RUN flag is set.
    """

    # Name of the output directory, relative to the current working directory
    OUTPUT_DIR = 'files'

    def open_spider(self, spider):
        """
        We do not have to do anything when starting this pipeline
        """
        logging.info("Starting JsonSeperateFileWriterPipeline")

    def close_spider(self, spider):
        """
        We print the statistics for this run when the spider quits

        Reads the 'item_counter' attribute of the spider.
        """
        logging.info("Finished! We processed %s files.", spider.item_counter)
        logging.info("Closing JsonSeperateFileWriterPipeline")

    def process_item(self, item, spider):
        """
        Every item that is processed is written to a separate json file.

        The file name is the item's 'local_id' with every '/' removed, plus
        '.json'. Afterwards every empty field of the item is logged. Returns
        the item unchanged.
        """
        if not DRY_RUN:
            # os.path.join keeps the path valid on every platform, and the
            # directory is created here because nothing else does so.
            target_dir = os.path.join(os.path.abspath(os.curdir), self.OUTPUT_DIR)
            os.makedirs(target_dir, exist_ok=True)

            file_name = '{}.json'.format(item['local_id'].replace('/', ''))
            with open(os.path.join(target_dir, file_name), 'w') as file:
                file.write(json.dumps(dict(item)) + "\n")

        # Lazy %-formatting: the URL may itself be None or missing, which
        # would break plain string concatenation.
        direct_url = item.get('direct_url')
        for key in item:
            if not item.get(key):
                logging.info("'%s' is None/empty for %s", key, direct_url)

        return item

In [None]:
class EucybsecSpider(CrawlSpider):
    """
    The spider designed for the European Union CELEX website.

    It starts at the EUR-Lex quick-search result page of every term in
    SEARCH_KEYS, follows the "Next Page" links and yields one dict per
    document that is linked from a result page.
    """
    
    name = 'eucybsec'
    allowed_domains = ['eur-lex.europa.eu'] # URLs outside these domains are not followed

    # One search-result start page per (already URL-encoded) search term
    start_urls = [
        'https://eur-lex.europa.eu/search.html?text={}&scope=EURLEX&type=quick&lang=en'.format(search_key)
        for search_key in SEARCH_KEYS
    ]
    
    custom_settings = {
        'LOG_LEVEL': logging.INFO,
        'ITEM_PIPELINES': {
#             '__main__.JsonWriterPipeline': 1,
            '__main__.JsonSeperateFileWriterPipeline': 1,
        }, 
        'RETRY_HTTP_CODES': [500, 503, 504, 400, 408, 404],
    }
    
    rules = (
        Rule(
            LinkExtractor(
                allow=(),
                restrict_xpaths=('(//a[@title="Next Page"])[1]',), # This will extract the URL for the next page, if existent
            ),
            callback='parse_item',
            follow=True,
        ),
    )
    
    # Non scrapy variables
    item_counter = 0 # Number of items yielded so far, also used as 'order_id'
    MAIN_DOMAIN = "https://eur-lex.europa.eu"

    @inline_requests
    def parse_document_page(self, response):
        """
        Yield the values for one specific document

        Besides the document page itself, the "Document information" page is
        requested inline, and - when the page has no HTML content - the PDF
        version(s) as well. Pages without a recognisable document ID are
        skipped with a warning, because the ID is needed to store the item.
        """
        
        # The ID is the second whitespace-separated token of the document title.
        # Check it first so no further requests are made for unusable pages.
        doc_title = response.xpath('//*[contains(@class,"DocumentTitle")]/text()').extract_first()
        title_parts = doc_title.split() if doc_title else []
        if len(title_parts) < 2:
            logging.warning("No document ID found! On url: " + response.url)
            return
        local_id = title_parts[1]
        
        # Get Document information page
        di_rel_path = response.xpath('//a[contains(text(),"Document information")]/@href').extract_first()
        doc_info = yield Request(response.urljoin(di_rel_path))
        
        # The neatest way to get the document date, but not always available
        document_date = response.xpath('//meta[@property="eli:date_document"]/@content').extract_first()
        
        # The other way to get the document date
        if not document_date:
            document_date = doc_info.xpath('//dt[contains(text(), "Date of document")]/following::dd[1]/text()').extract_first()
            if document_date:
                document_date = document_date.replace(';','')
        
        # The neatest way to get the publication date, but not always available
        publication_date = response.xpath('//meta[@property="eli:date_publication"]/@content').extract_first()
        
        # The other way to get the publication date.
        if not publication_date:
            publication_date = response.xpath('//p[@class="hd-date"]/text()').extract_first()
            if publication_date:
                publication_date = publication_date.strip()
                
        # Turns out, if the publication date is still unknown, it is the same as the document date!
        if not publication_date:
            publication_date = document_date
        
        # Collect all text of the HTML version of the document
        # TODO: See #5
        content_elements = response.xpath('//div[@class="tabContent"]//text()').extract()
        content = ' '.join(content_elements)
        
        eurovoc_descriptors = doc_info.xpath('//dd[preceding-sibling::dt[contains(text(), "EUROVOC")]][1]/ul//span/text()').extract()
        subject_matters = doc_info.xpath('//dt[contains(text(), "Subject matter")]/following::dd[1]//span/text()').extract()
        author = doc_info.xpath('//dt[contains(text(), "Author")]/following::dd[1]/*/text()').extract_first()
        
        # Document type, two ways
        document_type = response.xpath('//meta[@property="eli:type_document"]/@resource').extract_first()
        if not document_type:
            document_type = doc_info.xpath('//dt[contains(text(), "Form")]/following::dd[1]/span/text()').extract_first()
        
        #### Content extraction
        
        # The HTML version is not available, so we are going to extract the PDF
        if content == "":
            
            # Get the URL for the PDF file or stream page
            pdf_href = response.xpath("//a[@id='format_language_table_PDF_EN']/@href").extract_first()
            
            # Check whether this page even exist
            if pdf_href:
                                
                # Get the PDF page; everything after the last ".." of the
                # relative link is appended to the main domain
                pdf_href = self.MAIN_DOMAIN + pdf_href.split("..")[-1]
                pdf_response = yield Request(pdf_href)
                
                # Check if URL is actually a PDF file; a missing header counts as "not a PDF"
                content_type = (pdf_response.headers.get('content-type') or b'').decode("utf-8")
                if 'application/pdf' not in content_type:
                    
                    # If not, there are probably multiple URLs here!
                    streams = pdf_response.xpath("//ul[@class='multiStreams']//@href").extract()
                    abs_streams = [self.MAIN_DOMAIN + rel_path.split("..")[-1] for rel_path in streams]
                    
                    # Extract every PDF separately and glue the texts together
                    stream_texts = []
                    for stream in abs_streams:
                        stream_rsp = yield Request(stream)
                        stream_texts.append(self.extract_pdf(stream_rsp))
                    content = ''.join(stream_texts)
                else:
                    content = self.extract_pdf(pdf_response)
            else:
                logging.warning("No URL found! On url: " + response.url)
        
        # Minimize the content for efficiency: collapse whitespace and lowercase
        content = ' '.join(content.split()).lower()
        
        #### End of content extraction
        
        with lock:
            self.item_counter += 1
            this_item = self.item_counter
        
        yield {
            'title': response.xpath("//p[@id='translatedTitle']/text()").extract_first(),
            'local_id': local_id,
            'order_id': this_item,
            'document_type': document_type,
            'publication_date': publication_date,
            'document_date': document_date,
            'direct_url': response.url,
            'content': content,
            'eurovoc_descriptors': eurovoc_descriptors,
            'subject_matters': subject_matters,
            'author': author,
        }

    def parse_item(self, response):
        """
        Parsing a page with search results, calling for the next page in the end.

        Yields one request per search result, handled by parse_document_page.
        """
        
        item_links = response.xpath("//div[contains(@class,'SearchResult')]/h2/a/@href").extract()
            
        # The first character of every link is dropped before prepending the domain
        # NOTE(review): presumably the links start with "./" - confirm against the site
        item_links = [self.MAIN_DOMAIN + rel_path[1:] for rel_path in item_links]

        for a in item_links:
            yield scrapy.Request(a, callback=self.parse_document_page)
            
    
    def parse_start_url(self, response):
        """
        Parse the start_urls just as a regular page.
        """
        return self.parse_item(response)
    
    
    def extract_pdf(self, response):
        """
        Return the text of all pages of the PDF in the body of the response.

        The PDF is parsed straight from memory, so no temporary file is
        written and no file handle can be left open. When the PDF cannot be
        read, a warning is logged and an empty string is returned.
        """
        
        try:
            pdf_reader = PyPDF2.PdfFileReader(io.BytesIO(response.body))
            
            # Join the text of all the pages into the content
            return ''.join(
                pdf_reader.getPage(page_number).extractText()
                for page_number in range(pdf_reader.numPages)
            )
        
        # Deliberately broad: an unreadable PDF must not stop the whole item
        except Exception as e:
            logging.warning("Could not read PDF on: " + response.url +  ", Error:")
            logging.warning(e)
            return ""
            


In [None]:
def run_crawler():
    """
    Create a crawler process, schedule the EucybsecSpider on it and start it.
    """
    process_settings = {
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    }
    crawler_process = CrawlerProcess(process_settings)
    crawler_process.crawl(EucybsecSpider)
    crawler_process.start()


# TODO: Figure out how to 'restart' without having to fully restart the kernel.
run_crawler()