In [2]:
import urllib3
import re
from bs4 import BeautifulSoup
import os
import justext

In [3]:
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
user_agent = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}

http = urllib3.PoolManager(10, headers=user_agent)


class Crawler:
    
    def __init__(self, corpus_path, max_files, seed_url, url_pattern):
        self.corpus_path = corpus_path
        self.max_files = max_files
        self.seed_url = seed_url
        self.url_pattern = url_pattern
        self.visited_links = {}
        self.to_be_visited = []
        
        if not os.path.exists(self.corpus_path):
            os.makedirs(self.corpus_path)
        
    def crawl(self):
        first_urls = self.get_page(self.seed_url)
        self.add_links(first_urls)
        next_link = self.get_next_link()
        
        file_counter = 1
        while next_link and file_counter < self.max_files:
            links = self.get_page(next_link)
            self.add_links(links)
            next_link = self.get_next_link()
            file_counter += 1
    
    def get_page(self, url):
        print("getting page {}".format(url))
        response = http.request('GET', url)

        # store text content
        paragraphs = justext.justext(response.data, justext.get_stoplist("English"))
        with open("{}/{}.txt".format(self.corpus_path, url.replace(".", "_").replace("/","-")), "w") as output_file:
            for paragraph in paragraphs:
                if not paragraph.is_boilerplate:
                    output_file.write(paragraph.text)
        
        # get links
        soup = BeautifulSoup(response.data, 'html.parser')
        links = [link.get('href') for link in soup.findAll('a', attrs={'href': re.compile(self.url_pattern)})]
        return links

    def add_links(self, links):
        links = list(set(links))
        self.to_be_visited.extend([link for link in links if link not in self.visited_links])

    def get_next_link(self):
        next_link = self.to_be_visited.pop(0)
        self.visited_links[next_link] = None
        return next_link

In [4]:
crawler_tecnologia = Crawler("../Data/corpora/tecnologia", 50, "https://tecnoblog.net/281950/", "^https://tecnoblog\.net/\d+")
crawler_politica = Crawler("../Data/corpora/politica", 50, "http://blogs.opovo.com.br/politica/",
                           "^http://blogs\.opovo\.com\.br/politica/\d+")

In [5]:
crawler_politica.crawl()

getting page http://blogs.opovo.com.br/politica/
getting page http://blogs.opovo.com.br/politica/2019/04/15/bolsonaro-concede-passaporte-diplomatico-para-bispo-edir-macedo-e-esposa/#respond
getting page http://blogs.opovo.com.br/politica/2019/04/18/proposta-popular-no-senado-quer-criminalizar-coach-no-brasil/#respond
getting page http://blogs.opovo.com.br/politica/2019/04/17/em-meio-a-tumulto-na-ccj-campainha-para-de-funcionar-deputado-reage-senta-que-o-boi-e-manso/
getting page http://blogs.opovo.com.br/politica/2019/04/15/deputados-do-ce-expedem-36-passaportes-diplomaticos-incluindo-esposa-e-filhos/
getting page http://blogs.opovo.com.br/politica/2019/04/17/em-meio-a-tumulto-na-ccj-campainha-para-de-funcionar-deputado-reage-senta-que-o-boi-e-manso/#respond
getting page http://blogs.opovo.com.br/politica/2019/04/16/bolsonaro-lula-e-dilma-deram-passaportes-diplomaticos-a-edir-macedo-e-pelas-mesmas-razoes/#respond
getting page http://blogs.opovo.com.br/politica/2019/04/16/em-100-dias-ce

In [12]:
crawler_hardware = Crawler(os.path.join(os.pardir, "Data/corpora/hardware"), 50, "https://www.techspot.com/reviews/", "^https://www.techspot\.com/review/\d+")

In [7]:
crawler_hardware.crawl()

getting page https://www.techspot.com/reviews/
getting page https://www.techspot.com/review/1784-resident-evil-2-benchmarks/
getting page https://www.techspot.com/review/1796-best-rtx-2060-graphics-cards/
getting page https://www.techspot.com/review/1820-alienware-m15/
getting page https://www.techspot.com/review/1807-asus-rog-strix-scar2-rtx-laptop/
getting page https://www.techspot.com/review/1791-amd-radeon-vii-mega-benchmark/
getting page https://www.techspot.com/review/1798-gigabyte-aero-15-x9/
getting page https://www.techspot.com/review/1808-geforce-gtx-1660-ti-vs-rtx-2060-vs-gtx-980-ti/
getting page https://www.techspot.com/review/1831-ray-tracing-geforce-gtx-benchmarks/
getting page https://www.techspot.com/review/1829-intel-core-i5-9400f-vs-amd-ryzen-5-2600x/
getting page https://www.techspot.com/review/1816-asrock-deskmini-a300/
getting page https://www.techspot.com/review/1818-sekiro-shadows-die-twice/
getting page https://www.techspot.com/review/1825-ryzen-2600x-vs-1700/
g

In [16]:
crawler_mobile = Crawler(os.path.join(os.pardir, "Data/corpora/mobile_review"), 50, "https://www.theverge.com/phone-review", "^https://www\.theverge\.com/\d+")

In [17]:
crawler_mobile.crawl()

getting page https://www.theverge.com/phone-review
getting page https://www.theverge.com/2019/4/9/18301982/lg-g8-thinq-review-android-phone-snapdragon-855-6-gb-ram-z-camera-sensor
getting page https://www.theverge.com/2019/4/16/18308523/kickstater-indiegogo-crowdfund-gadget-never-shipped
getting page https://www.theverge.com/2018/10/22/18001100/razer-phone-2-review-gaming-chroma-features-light-up-price-specs#comments
getting page https://www.theverge.com/2018/7/20/17591130/editors-choice-lg-g7-thinq-phone-review#comments
getting page https://www.theverge.com/2019/2/21/18230041/samsung-galaxy-s10-vs-s8-s9-upgrade-specs-price-comparison-camera
getting page https://www.theverge.com/2019/4/17/18410833/microsoft-surface-hub-2-price-size-specs-features-hands-on-video
getting page https://www.theverge.com/2018/10/15/17973484/google-pixel-3-xl-review-camera-features-screen-battery-price-photos#comments
getting page https://www.theverge.com/2018/6/27/17504714/blackberry-key2-review-smartphone-a