In [None]:
import pandas as pd
import time
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
from scrapy.item import Field, Item
from scrapy.spiders import Spider, CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy import signals

In [None]:
class CultivosDeFlores(Item):
    """Scrapy item holding the data scraped for one company page.

    Every field is filled by ``CultivosFlores.parse_empresa`` from an XPath
    evaluated against the company detail response.
    """

    # Text of a cell in the first table of the page's "content" area.
    company_name = Field()
    # Link text of a cell in the second table; presumably a registry/tax
    # identifier -- TODO confirm against the live page.
    company_id = Field()
    # Text of the element whose id is "myTelephone".
    phone = Field()
    # Text of the element whose id is "situation_loc".
    city = Field()
    # Text of the element whose id is "situation_prov".
    state = Field()
    # Text of another cell in the second table of the "content" area.
    company_activity = Field()


class RotateUserAgentMiddleware(UserAgentMiddleware):
    """Downloader middleware that sets a randomly chosen User-Agent header.

    Candidates are read from the ``USER_AGENT_LIST`` setting on every
    request. When that setting is missing or empty, the middleware falls back
    to ``self.user_agent`` instead of raising.
    """

    def __init__(self, user_agent=''):
        """Store *user_agent* as the fallback value via the base class."""
        super().__init__(user_agent)

    def process_request(self, request, spider):
        """Set the ``User-Agent`` header of *request* if it has none yet.

        Args:
            request: The outgoing request whose headers are updated in place.
            spider: The running spider; its settings supply
                ``USER_AGENT_LIST``.

        Returns:
            None, so the request continues through the middleware chain.
        """
        candidates = spider.settings.get('USER_AGENT_LIST') or []
        if isinstance(candidates, str):
            # A bare string would make random.choice return one character.
            candidates = [candidates]

        # random.choice raises on an empty sequence, so guard before calling.
        ua = random.choice(candidates) if candidates else self.user_agent
        if ua:
            # setdefault keeps a User-Agent that was set explicitly upstream.
            request.headers.setdefault('User-Agent', ua)


class CultivosFlores(CrawlSpider):
    """Crawl the einforma.co company directory and emit one item per company.

    Starting from ``start_urls``, the rules follow pagination links and
    company detail links; each detail page is handed to ``parse_empresa``.
    """

    name = 'Cultivos'
    custom_settings = {
        'USER_AGENT_LIST': [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36',
        ],
        'DOWNLOADER_MIDDLEWARES': {
            # Swap Scrapy's stock User-Agent middleware for the rotating one.
            '__main__.RotateUserAgentMiddleware': 110,
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
        },
        # Spread each wait over 0.5x-1.5x of download_delay (5-15 s here).
        'RANDOMIZE_DOWNLOAD_DELAY': True,
    }

    start_urls = ['https://directorio-empresas.einforma.co/servlet/app/portal/EMP/prod/LISTADO_EMPRESAS/razonsocial/CULTIVO+DE+FLORES', 'https://directorio-empresas.einforma.co/']

    # Pacing is delegated to the downloader: a base delay of 10 s, randomized
    # to 5-15 s, replaces the former blocking time.sleep() in the callback,
    # which froze the whole reactor while it slept.
    download_delay = 10

    rules = (
        # Company detail pages: scrape them and keep following their links.
        Rule(
            LinkExtractor(
                allow=r'informacion-empresa/'
            ), follow=True, callback="parse_empresa"
        ),
        # Pagination links: followed only to discover more pages.
        # NOTE(review): `?` is a regex quantifier here, so this pattern
        # matches any URL containing "qPg=" -- confirm that is intended.
        Rule(
            LinkExtractor(
                allow=r'/?qPg='
            ), follow=True
        ),
    )

    def parse_empresa(self, response):
        """Extract one ``CultivosDeFlores`` item from a company detail page.

        Args:
            response: The downloaded company page.

        Yields:
            The loaded item; each field holds the values matched by its XPath.
        """
        loader = ItemLoader(CultivosDeFlores(), response)

        # NOTE(review): the `tbody` steps look copied from browser dev tools.
        # Browsers may insert <tbody> elements that are absent from the raw
        # HTML Scrapy receives -- confirm against the downloaded response.
        loader.add_xpath('company_name',
                         '//*[@id="content"]/div[1]/div[4]/div/table[1]/tbody/tr[2]/td[2]/text()')
        loader.add_xpath('company_id',
                         '//*[@id="content"]/div[1]/div[4]/div/table[2]/tbody/tr[2]/td[2]/a/text()')
        loader.add_xpath('phone',
                         '//*[@id="myTelephone"]/text()')
        loader.add_xpath('city',
                         '//*[@id="situation_loc"]/text()')
        loader.add_xpath('state',
                         '//*[@id="situation_prov"]/text()')
        loader.add_xpath('company_activity',
                         '//*[@id="content"]/div[1]/div[4]/div/table[2]/tbody/tr[3]/td[2]/text()')

        yield loader.load_item()


# Feed export configuration: write every scraped item to a CSV file.
feed_settings = {
    'FEEDS': {
        'Cultivo_De_Flores.csv': {'format': 'csv'},
    },
}

# Build the crawler process, schedule the spider, then run the crawl.
process = CrawlerProcess(settings=feed_settings)
process.crawl(CultivosFlores)
process.start()