# Crawling [bezdim.org](https://bezdim.org/signali/reports)

### Import Scrapy and install if missing

In [1]:
try:
    import scrapy
except:
    import sys
    !conda install --yes --prefix {sys.prefix} -c conda-forge scrapy
    import scrapy
from scrapy.crawler import CrawlerProcess

#### Other imports

In [2]:
from time import time
from os.path import join
import pandas as pd

# Unix time in whole seconds, captured once when this cell runs. It is embedded
# in the feed file name, so the spider settings and the later read-back cell
# refer to the same 'report-result-<timestamp>.json' file.
timestamp = int(time())

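The custom `JsonWriterPipeline` defined below is currently not used: its `ITEM_PIPELINES` entry in the spider settings is commented out, and the results are written through the `FEED_FORMAT` / `FEED_URI` feed settings instead.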

In [3]:
import json
import codecs


class JsonWriterPipeline:
    """Item pipeline that writes every item to ``reportresult.jl`` as one JSON line.

    The file is opened when the spider opens and closed when it closes.
    """

    def open_spider(self, spider):
        """Open (truncating) the output file in the current working directory."""
        # Explicit UTF-8 so the result does not depend on the platform's
        # default encoding; the scraped text is largely non-ASCII.
        self.file = open('reportresult.jl', 'w', encoding='utf-8')

    def close_spider(self, spider):
        """Close the output file opened in ``open_spider``."""
        self.file.close()

    def process_item(self, item, spider):
        """Serialize ``item`` as a single JSON line and return it unchanged.

        :param item: mapping-like scraped item (converted with ``dict(item)``).
        :param spider: the spider that produced the item (unused).
        :return: the same ``item``, so later pipelines can keep processing it.
        """
        # ensure_ascii=False keeps non-ASCII text readable instead of
        # emitting \uXXXX escapes; the parsed JSON value is the same.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

### Helper functions

In [4]:
class ParseUtils:
    """Stateless helpers that turn selector fragments of a report page into plain values."""

    @staticmethod
    def parse_description(raw_description):
        """Join the non-blank text fragments of a description into one string.

        :param raw_description: iterable of raw text fragments.
        :return: the stripped, non-empty fragments joined by single spaces.
        """
        stripped = (fragment.strip() for fragment in raw_description)
        return " ".join(fragment for fragment in stripped if fragment)

    @staticmethod
    def parse_time_location(when_where_element):
        """Extract the report date and location from the when/where element.

        The first ``span`` below the element is queried for ``span.r_date`` and
        the second one for ``span.r_location``.

        :param when_where_element: selector whose descendant spans hold the data.
        :return: ``(date, location)``; a missing value is reported as
                 ``'N/A Date'`` / ``'N/A Location'`` respectively.
        """
        # Same placeholders on every path, including the "no spans" one.
        date, location = 'N/A Date', 'N/A Location'

        spans = when_where_element.xpath('.//span')
        if len(spans) > 0:
            date = spans[0].css('span.r_date::text').extract_first(default=date)
        if len(spans) > 1:
            location = spans[1].css('span.r_location::text').extract_first(default=location)

        return date, location

    @staticmethod
    def parse_categories(categories_element):
        '''
        Return the stripped text of every category link.

        Can have multiple categories
        https://bezdim.org/signali/reports/view/10099
        '''
        links = categories_element.xpath('.//p//a')
        return [category.strip() for category in links.css('a::text').extract()]

    @staticmethod
    def parse_file_urls(content_element):
        """Collect the links found in the list items of the content element.

        :param content_element: selector searched with ``.//ul//li``.
        :return: list of ``{'url': ..., 'name': ...}`` dicts, one per ``a`` tag;
                 missing values become ``'N/A url'`` / ``'N/A filename'``.
        """
        list_items = content_element.xpath('.//ul//li')
        return [
            {
                'url': tag.xpath('@href').extract_first(default='N/A url'),
                'name': tag.css('::text').extract_first(default='N/A filename'),
            }
            for tag in list_items.css('a')
        ]

In [5]:
import logging
from scrapy import Spider
from scrapy.http import Request

class BezDimSpider(Spider):
    """Spider that requests the paginated report listing and scrapes every linked report.

    ``start_requests`` asks for ``page_count`` listing pages starting at
    ``start_page``; ``process_page`` follows the report links on each page;
    ``parse`` turns one report page into a dict item.
    """
    name = "bezdim"
    start_page = 1    # first listing page requested
    page_count = 100  # number of consecutive listing pages requested
    start_urls = [
        'https://bezdim.org/signali/reports'
    ]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
#         'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1},
        'FEED_FORMAT':'json',
        'FEED_URI': join('reports_data','report-result-{timestamp}.json'.format(timestamp=timestamp))
    }

    def start_requests(self):
        """Yield one request per listing page, handled by ``process_page``."""
        base_url = self.start_urls[0]
        for page_id in range(self.start_page, self.start_page + self.page_count):
            yield Request('{url}/fetch_reports?page={page_id}'.
                          format(url=base_url, page_id=page_id),
                          callback=self.process_page)

    def process_page(self, response):
        """Yield a request for every report link (``a.r_title``) on a listing page."""
        for link in response.css('a.r_title'):
            href = link.xpath('@href').extract_first()
            if not href:
                # An anchor without a usable href would make Request() raise
                # and abort the remaining links of this page; skip it instead.
                continue
            # urljoin leaves absolute URLs untouched and resolves relative ones
            # against the listing page URL.
            yield Request(response.urljoin(href), callback=self.parse)

    def parse(self, report):
        """Scrape a single report page into a dict item.

        :param report: response for a report page; the last path segment of
                       its URL is used as the integer report id.
        :return: generator yielding one dict with the keys ``id``, ``title``,
                 ``description``, ``date``, ``location``, ``categories``, ``files``.
        """
        report_id = int(report.url.split('/')[-1])
        title = report.css('h1.report-title::text').extract_first(default='').strip()

        raw_desc = report.css('div.report-description-text::text').extract()
        description = ParseUtils.parse_description(raw_desc)

        date, location = ParseUtils.parse_time_location(report.css('p.report-when-where'))

        categories = ParseUtils.parse_categories(report.css('div.report-category-list'))

        files = ParseUtils.parse_file_urls(report.css('div.content'))

        yield {
            'id': report_id,
            'title': title,
            'description': description,
            'date': date,
            'location': location,
            'categories': categories,
            'files': files
        }
        
#             description = report.css('div.r_description::text').extract_first().strip()
#             address = report.css('p.bd_location::text').extract_first().strip()
            
            

In [6]:
# Build the crawler process; the only process-wide setting is the User-Agent header.
process = CrawlerProcess(settings={
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
})

# Schedule the spider, then start the crawl.
process.crawl(BezDimSpider)
process.start()

2019-02-04 22:38:47 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: scrapybot)
2019-02-04 22:38:47 [scrapy.utils.log] INFO: Versions: lxml 4.3.0.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 17.5.0, Python 3.6.6 | packaged by conda-forge | (default, Oct 12 2018, 07:24:56) - [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 18.0.0 (OpenSSL 1.0.2p  14 Aug 2018), cryptography 2.3.1, Platform Darwin-18.2.0-x86_64-i386-64bit
2019-02-04 22:38:47 [scrapy.crawler] INFO: Overridden settings: {'FEED_FORMAT': 'json', 'FEED_URI': 'reports_data/report-result-1549312727.json', 'LOG_LEVEL': 30, 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}


In [7]:
import pandas as pd

# Read back the feed file of this session (same name as configured in FEED_URI).
df = pd.read_json(join(
    'reports_data',
    'report-result-{timestamp}.json'.format(timestamp=timestamp),
))

In [8]:
df

Unnamed: 0,categories,date,description,files,id,location,title
0,[заведение за хранене и развлечение],09:30 Mar 30 2018,"В сладкарница Малинка, пушат навсякъде! * Сигн...",[{'url': 'https://bezdim.org/signali/media/upl...,10244,"град София, кв. Младост, сп. Окръжна болница, ...","Пушене в сладкарница Малинка, град София"
1,[заведение за хранене и развлечение],02:13 Mar 31 2018,В Глори бар и грил се пуши дори през деня. * С...,[{'url': 'https://bezdim.org/signali/media/upl...,10249,"град София, ул. Пирински проход 24 А","Неспазване на забраната за тютюнопушене, град ..."
2,[заведение за хранене и развлечение],05:00 Jul 6 2018,име обект: Mr. Pizza вид обект: Ресторант град...,[{'url': 'https://bezdim.org/signali/media/upl...,10355,"град София , бул. ""Черни връх"" № 100","Пушене в ресторант ""Mr. Pizza"", град София (м)"
3,[заведение за хранене и развлечение],01:50 Dec 18 2018,име обект: Галерия вид обект: рсторант град: К...,[{'url': 'https://bezdim.org/signali/media/upl...,10471,"град Кюстендил, ул. ""Д. Димитрий"" 9","Пушене в рсторант ""Галерия"", град Кюстендил (м)"
4,[заведение за хранене и развлечение],09:23 Dec 17 2018,"Понеделник вечерта, седнах да вечерям в SARAY ...",[{'url': 'https://bezdim.org/signali/media/upl...,10469,"град София, Дружба 2, ул. Копенхаген № 17","Пушене в заведение SARAY turkish restaurant, г..."
5,[заведение за хранене и развлечение],00:43 Dec 19 2018,име обект: Casita food & wine вид обект: Ресто...,[{'url': 'https://bezdim.org/signali/media/upl...,10472,"град София, ул. Доспат 66","Пушене в ресторант ""Casita food & wine"", град ..."
6,[заведение за хранене и развлечение],08:44 Dec 20 2018,"В кафе бар ""Табакера"" се пуши всяка вечер,вътр...",[{'url': 'https://bezdim.org/signali/media/upl...,10474,"град София, ж.к. Люлин, ул. Тодор Влайков 27","Пуши се в кафе бар ""Табакера"", град София"
7,[заведение за хранене и развлечение],00:40 Dec 20 2018,Тютюнопушене в заведението на обяд??!! * Сигна...,[{'url': 'https://bezdim.org/signali/media/upl...,10473,"град София, ул. Алабин 35","Пушене в ресторант ""Етно"", град София"
8,[заведение за хранене и развлечение],08:45 May 3 2018,име обект: На улицата вид обект: Ресторант гра...,[{'url': 'https://bezdim.org/signali/media/upl...,10319,"град София, бул. Алвксандър Пушкин 38","Пушене в ресторант ""На улицата"", град София (м)"
9,[заведение за хранене и развлечение],02:30 May 5 2018,От самото отваряне на клуба преди около 6 месе...,[{'url': 'https://bezdim.org/signali/media/upl...,10322,"Искра 12, град Казанлък","Редовни нарушения в клуб ""Noir"", град Казанлък"
