# PARTE 1: Extração de Dados

## Configurando Ambiente

In [None]:
# Notebook display setup.
from IPython.core.interactiveshell import InteractiveShell
# NOTE(review): "all" is expected to make IPython echo every top-level
# expression of a cell instead of only the last one -- confirm against the IPython docs.
InteractiveShell.ast_node_interactivity = "all"
# Show the Python version
import platform
platform.python_version()

In [None]:
try: # Checando se Scrapy está instalado
    import scrapy
except:
    !pip install scrapy
    import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

In [None]:
class QuotesSpider(scrapy.Spider):
    """Spider that scrapes the quotes listed on page 1 of quotes.toscrape.com."""
    name = "quotes"
    # URLs to crawl
    start_urls = [
        'http://quotes.toscrape.com/page/1/'
    ]

    def parse(self, response):
        """Yield one dict per ``div.quote`` element found in ``response``.

        Each dict has:
            text:   the quote text as a single string (``None`` if missing)
            author: the author name as a single string (``None`` if missing)
            tags:   list of tag strings (possibly empty)
        """
        for quote in response.css('div.quote'):
            yield {
                # extract_first() gives a scalar instead of a one-element list,
                # matching the item shape produced by the second spider in this notebook.
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }

In [None]:
# Build a crawler process from whatever get_project_settings() returns
process = CrawlerProcess(get_project_settings())

# Starting the process: register the spider, then launch the crawl
process.crawl(QuotesSpider)
# NOTE(review): the notebook asks for a restart before Part 2 -- presumably
# because start() cannot be called a second time in the same kernel; confirm.
process.start()

# PARTE 2: Gerando Arquivo de Saída

## Antes de executar este trecho, reinicie o kernel do Jupyter Notebook

In [None]:
# Notebook display setup (repeated here because this part runs in a restarted notebook).
from IPython.core.interactiveshell import InteractiveShell
# NOTE(review): "all" is expected to make IPython echo every top-level
# expression of a cell instead of only the last one -- confirm against the IPython docs.
InteractiveShell.ast_node_interactivity = "all"
# Show the Python version
import platform
platform.python_version()

try: # Checando se Scrapy está instalado
    import scrapy
except:
    !pip install scrapy
    import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

In [None]:
import json

class JsonWriterPipeline:
    """Item pipeline that writes every scraped item as one JSON object per line.

    The destination is ``output_path``; override it on a subclass or instance
    to write somewhere else without redefining the whole class.
    """

    # Destination of the JSON-lines output (relative to the working directory)
    output_path = 'quoteresult.jl'

    def open_spider(self, spider):
        """Create (or truncate) the output file when the spider starts."""
        # Explicit UTF-8 so the result does not depend on the platform's default
        # encoding now that non-ASCII characters are written verbatim.
        self.file = open(self.output_path, 'w', encoding='utf-8')

    def close_spider(self, spider):
        """Close the output file once the spider has finished."""
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the output file as a JSON line and return it unchanged."""
        # ensure_ascii=False keeps accented characters and curly quotes readable
        # instead of emitting \uXXXX escapes; the JSON content is equivalent.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

In [None]:
import logging

class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes from the first two pages of quotes.toscrape.com."""
    name = "quotes"
    # URLs to crawl
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
        'http://quotes.toscrape.com/page/2/',
    ]

    # Pipeline configuration required to generate the output file.
    # NOTE(review): FEED_FORMAT/FEED_URI presumably make Scrapy's feed exporter
    # write quoteresult.json in addition to the pipeline's own file -- confirm
    # both outputs are wanted.
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1},
        'FEED_FORMAT': 'json',
        'FEED_URI': 'quoteresult.json'
    }

    def parse(self, response):
        """Yield one dict per ``div.quote`` element found in ``response``.

        Each dict has:
            text:   the quote text as a single string (``None`` if missing)
            author: the author name as a single string (``None`` if missing)
            tags:   list of tag strings (possibly empty)
        """
        for quote in response.css('div.quote'):
            yield {
                # extract_first() returns None for a missing node, whereas
                # extract()[0] would abort the whole page with an IndexError.
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract()
            }

In [None]:
# Build a crawler process from whatever get_project_settings() returns
process = CrawlerProcess(get_project_settings())

# Starting the process: register the spider, then launch the crawl
process.crawl(QuotesSpider)
# NOTE(review): the section heading asks for a notebook restart first --
# presumably because start() cannot be called twice in the same kernel; confirm.
process.start()

In [None]:
import pandas as pd
# Load the JSON-lines file created by JsonWriterPipeline to inspect the output
output = pd.read_json('quoteresult.jl', lines=True)
# Last expression of the cell: displays the resulting DataFrame
output

## ---------------------------------------------------------------------------------------------------------------------------------

## Outro Exemplo: site SEFAZ

In [None]:
try: # Checando se Scrapy está instalado
    import scrapy
except:
    !pip install scrapy
    import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

In [None]:
import json

class JsonWriterPipeline:
    """Item pipeline that writes every scraped item as one JSON object per line.

    The destination is ``output_path``; override it on a subclass or instance
    to write somewhere else without redefining the whole class.
    """

    # Destination of the JSON-lines output (relative to the working directory)
    output_path = 'normaresult.jl'

    def open_spider(self, spider):
        """Create (or truncate) the output file when the spider starts."""
        # Explicit UTF-8 so the result does not depend on the platform's default
        # encoding now that non-ASCII characters are written verbatim.
        self.file = open(self.output_path, 'w', encoding='utf-8')

    def close_spider(self, spider):
        """Close the output file once the spider has finished."""
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the output file as a JSON line and return it unchanged."""
        # ensure_ascii=False keeps accented Portuguese text readable instead of
        # emitting \uXXXX escapes; the JSON content is equivalent.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

In [None]:
class SefazItem(scrapy.Item):
    """Scrapy item describing the fields collected from one norma page."""
    # Declared, but NormaSpider.parse in this notebook never assigns it
    decreto = scrapy.Field()
    # Filled from get_publicacao_dou(response)
    publicacao_dou = scrapy.Field()
    # Filled from get_ementa(response)
    ementa = scrapy.Field()
    # Filled from get_norma_elements(response)
    norma = scrapy.Field()

In [None]:
def get_publicacao_dou(response):
    """Return the first text node found under ``div.tituloPublicacao``, stripped.

    Args:
        response: object exposing a Scrapy-style ``css()`` method.

    Returns:
        The stripped text, or ``None`` when the selector matches nothing
        (plain ``[0]`` indexing would raise ``IndexError`` in that case and
        abort the whole parse).
    """
    matches = response.css('div.tituloPublicacao ::text')
    if not matches:
        return None
    return matches[0].extract().strip()


def get_ementa(response):
    """Return the ementa text of the page as a single string.

    The ``p.ementa`` text comes back split into several fragments; each one is
    stripped and the non-empty ones are joined with single spaces.

    Args:
        response: object exposing a Scrapy-style ``css()`` method.

    Returns:
        The joined text, or ``''`` when there is no ementa text.
    """
    fragments = response.css('p.ementa::text').extract()
    stripped = (fragment.strip() for fragment in fragments)
    # Whitespace-only fragments strip down to '' and would otherwise leave
    # doubled spaces in the joined text.
    return ' '.join(piece for piece in stripped if piece)

def get_norma_elements(response):
    """Collect the first ``span`` text of every ``div.divSegmentos`` element.

    Args:
        response: object exposing a Scrapy-style ``css()`` method.

    Returns:
        List of stripped strings, one per segment that has span text, in
        document order. Segments with no span text are skipped instead of
        raising ``IndexError`` and aborting the whole parse.
    """
    norma_list = []
    for segment in response.css('div.divSegmentos'):
        # Only the first <span> text node inside the segment is kept
        span_texts = segment.css('span::text')
        if not span_texts:
            continue
        norma_list.append(span_texts[0].extract().strip())
    return norma_list


In [None]:
import logging

class NormaSpider(scrapy.Spider):
    """Spider that fetches one norma page and emits a single SefazItem for it."""
    name = 'norma'
    start_urls = [
        'http://normas.receita.fazenda.gov.br/sijut2consulta/link.action?visao=anotado&idAto=95072'
    ]

    # Pipeline configuration required to generate the output file
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1},
        'FEED_FORMAT': 'json',
        'FEED_URI': 'normaresult.json'
    }

    def parse(self, response):
        """Yield one SefazItem built from the publication line, ementa and norma segments."""
        # One page in, one item out; the fields are filled in the same order
        # (publicacao_dou, ementa, norma) through the item's keyword constructor.
        yield SefazItem(
            publicacao_dou=get_publicacao_dou(response),
            ementa=get_ementa(response),
            norma=get_norma_elements(response),
        )


In [None]:
# Build a crawler process from whatever get_project_settings() returns
process = CrawlerProcess(get_project_settings())

# Starting the process: register the spider, then launch the crawl
process.crawl(NormaSpider)
# NOTE(review): an earlier heading in this notebook asks for a restart before
# re-running a crawl -- presumably the same applies here; confirm.
process.start()

In [None]:
import pandas as pd
# Load the JSON-lines file created by JsonWriterPipeline to inspect the output
output = pd.read_json('normaresult.jl', lines=True)
# Last expression of the cell: displays the resulting DataFrame
output

In [None]:
 # Set pandas' display.max_colwidth option to 5000 -- presumably so the long
 # ementa/norma text is shown in full; confirm.
 # NOTE(review): this cell starts with a stray leading space; plain Python would
 # reject it, so consider removing the indent.
 pd.options.display.max_colwidth = 5000

In [None]:
# Display the DataFrame again, now with the max_colwidth option set in the previous cell
output

## Links Úteis