In [1]:
import re
import time
import urllib
import urllib.parse
from datetime import datetime

import networkx as nx
import requests
from bs4 import BeautifulSoup
from mongoengine import connect

from db_model import WikiUrls

In [2]:
# Root of the site being crawled, and the entry page expressed relative to it.
BASE_URL = "https://en.wikipedia.org"
START_URL = "wiki/Special:Random"

# Absolute address of the first page handed to the crawler.
URL = urllib.parse.urljoin(base=BASE_URL, url=START_URL)

In [6]:
class Crawler:
    """Level-by-level crawler that maps a site's links and probes external ones.

    Starting from ``current_url`` the crawler fetches pages, records every
    kept link as an edge of ``crawling_graph`` (nodes are URL paths, edges
    carry a ``domain`` attribute of ``"internal"`` or ``"external"``), queues
    internal links for later levels and issues a GET request against every
    external link, storing the outcome in ``rotten_links`` and in MongoDB.

    Attributes:
        current_url: URL being (or about to be) crawled.
        crawled: set of final (post-redirect) URLs already fetched.
        to_crawl: FIFO list of internal URLs waiting to be fetched.
        base_url: root used to resolve relative hrefs.
        crawling_graph: ``networkx.DiGraph`` of page-path -> link-path edges.
        num_crawled_pages: number of pages fetched so far.
        depth: number of levels ``queue`` walks through.
        rotten_links: list of ``(source page, external url, status)`` tuples,
            one per probed external link (successful ones included).
    """

    # Seconds to wait on a single HTTP request before giving up.
    REQUEST_TIMEOUT = 10

    # Browser-like headers sent when probing external links.
    EXTERNAL_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome',
        'Accept': 'text/html, application/xhtml+xml, application/xml;q=0.9,image/webp,*/*;q=0.8',
        # The HTTP header is spelled "Referer"; "Referrer" is not a header servers read.
        'Referer': 'https://www.google.fr',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'fr-CH, fr;q=0.9, en-US,en;q=0.9, de;q=0.7, *;q=0.5'
    }

    # A dot followed by at most four characters at the very end of the href.
    _REGEX_EXTENSION = re.compile(r"\..{0,4}$")
    # Wiki namespaces that are not articles and must not be followed.
    _REGEX_SPECIAL = re.compile(
        r"wiki/(?:Template|Template_talk|Special|Wikipedia|Category|Help|Portal):"
    )
    # Matches any href that does not start with '#' (i.e. not a same-page anchor).
    _REGEX_NO_HASH = re.compile("^(?!#)")

    def __init__(self, current_url, base_url, depth):
        """Set up an empty crawl rooted at ``current_url``.

        Args:
            current_url: absolute URL of the first page to fetch.
            base_url: root against which relative hrefs are resolved.
            depth: number of levels ``queue`` will process.
        """
        self.current_url = current_url
        self.crawled = set()
        self.to_crawl = []
        self.base_url = base_url
        self.crawling_graph = nx.DiGraph()
        self.num_crawled_pages = 0
        self.depth = depth
        self.rotten_links = []

    def to_db(self, current_url, url, code):
        """Persist one probed link as a ``WikiUrls`` document.

        Args:
            current_url: page on which the link was found.
            url: the link that was probed.
            code: probe outcome (HTTP status or an error label); stored as a string.
        """
        connect('wiki_urls_db', alias='core')
        record = WikiUrls(original_url=current_url,
                          current_url=url,
                          status_code=str(code),
                          date_crawl=datetime.now())
        record.save()

    def same_domain_absolute_path(self, url, domain="/wiki"):
        """Return the absolute URL of ``url`` when its href starts with ``domain``.

        Args:
            url: mapping-like object exposing the link target under ``"href"``.
            domain: required href prefix.

        Returns:
            The href joined onto ``base_url``, or ``None`` when the prefix
            does not match.
        """
        if url["href"].startswith(domain):
            return urllib.parse.urljoin(self.base_url, url["href"])
        return None

    def check_domain(self, url, domain):
        """Classify ``url`` as ``"internal"`` or ``"external"``.

        Args:
            url: absolute URL to classify.
            domain: network location (host) considered internal.

        Returns:
            ``"internal"`` when the URL's netloc equals ``domain``,
            ``"external"`` otherwise.
        """
        if urllib.parse.urlparse(url).netloc == domain:
            return "internal"
        return "external"

    def add_links(self, url, domain: str):
        """Record ``url`` under the current page as an internal or external link.

        The lists are kept as ``"internal"`` / ``"external"`` attributes of
        the ``current_url`` node in ``crawling_graph``. (A ``DiGraph`` does
        not support ``graph[node] = ...`` assignment, so the lists cannot
        live directly under ``graph[node]``.)

        Args:
            url: link to record.
            domain: ``"internal"`` or ``"external"``.

        Raises:
            KeyError: if ``domain`` is neither of the two expected labels.
        """
        graph = self.crawling_graph
        if self.current_url not in graph:
            graph.add_node(self.current_url)
        node_attrs = graph.nodes[self.current_url]
        node_attrs.setdefault("internal", [])
        node_attrs.setdefault("external", [])
        node_attrs[domain].append(url)

    def url_filters(self, url):
        """Decide whether a link should be kept.

        A link is dropped when its href contains ``"wiki"`` and ends with a
        dot followed by at most four characters, or when it points into one
        of the non-article wiki namespaces (Template, Template_talk, Special,
        Wikipedia, Category, Help, Portal).

        Args:
            url: mapping-like object exposing the link target under ``"href"``.

        Returns:
            ``url`` unchanged when the link is kept, ``None`` when it is dropped.
        """
        href = url["href"]
        if self._REGEX_EXTENSION.search(href) and "wiki" in href:
            return None
        if self._REGEX_SPECIAL.search(href):
            return None
        return url

    def try_external_link(self, url):
        """Issue a GET request against ``url`` and report the outcome.

        Args:
            url: absolute URL of the external link.

        Returns:
            The integer HTTP status code on success, ``"Not HTTP protocol"``
            for schemes requests has no adapter for, ``"Connection refused"``
            on connection errors and timeouts, or the exception class name
            for any other ``requests`` failure.
        """
        print(url)
        try:
            # The context manager releases the session's pooled connections.
            with requests.Session() as session:
                prepared = requests.Request(
                    'GET', url, headers=self.EXTERNAL_HEADERS
                ).prepare()
                # Without a timeout a silent server would stall the crawl forever.
                response = session.send(prepared, timeout=self.REQUEST_TIMEOUT)
        except requests.exceptions.InvalidSchema:
            return "Not HTTP protocol"
        except requests.exceptions.ConnectionError:
            return "Connection refused"
        except requests.exceptions.Timeout:
            return "Connection refused"
        except requests.exceptions.RequestException as exc:
            # One malformed or misbehaving link must not abort the whole crawl.
            return type(exc).__name__
        return response.status_code

    def crawl(self):
        """Fetch ``current_url`` and process every kept link in its content.

        The page is added to ``crawled`` under its post-redirect URL. Each
        kept link becomes a graph edge; internal links not yet crawled are
        appended to ``to_crawl`` (without their ``#fragment``); external
        links are probed, appended to ``rotten_links`` and saved via
        ``to_db``. Sleeps one second before returning.
        """
        # ----- GET ALL LINKS FOR CURRENT URL -----
        response = requests.get(self.current_url, timeout=self.REQUEST_TIMEOUT)
        self.current_url = response.url
        self.num_crawled_pages += 1
        self.crawled.add(self.current_url)

        current = urllib.parse.urlparse(self.current_url)
        self.crawling_graph.add_node(current.path)

        soup = BeautifulSoup(response.text, "lxml")
        body = soup.find("div", attrs={"id": "mw-content-text"})
        # Pages without the content container simply contribute no links.
        anchors = [] if body is None else body.find_all("a", href=self._REGEX_NO_HASH)

        for anchor in anchors:
            if self.url_filters(anchor) is None:
                continue
            url = urllib.parse.urljoin(self.base_url, anchor["href"])
            domain_status = self.check_domain(url, current.netloc)
            self.crawling_graph.add_edge(
                current.path, urllib.parse.urlparse(url).path, domain=domain_status
            )

            if domain_status == "internal":
                # "#section" variants are the same page; queue it only once.
                page_url = urllib.parse.urldefrag(url).url
                if page_url not in self.crawled:
                    self.to_crawl.append(page_url)
            elif domain_status == "external":
                code = self.try_external_link(url)
                self.rotten_links.append((self.current_url, url, code))
                self.to_db(self.current_url, url, code)

        # Be polite to the crawled site.
        time.sleep(1)
        print(self.num_crawled_pages)

    def queue(self, max_pages=200):
        """Crawl ``depth`` levels, starting from ``current_url``.

        Each level processes exactly the URLs that were waiting in
        ``to_crawl`` when the level started; links discovered meanwhile
        belong to the next level. URLs already present in ``crawled`` are
        skipped.

        Args:
            max_pages: stop the whole crawl once ``num_crawled_pages``
                reaches this value.
        """
        self.to_crawl.append(self.current_url)

        for level in range(self.depth):
            print("Level of crawling : ", level)
            for _ in range(len(self.to_crawl)):
                # Returning (rather than breaking) also stops the outer levels.
                if self.num_crawled_pages >= max_pages:
                    return
                next_url = self.to_crawl.pop(0)
                if next_url in self.crawled:
                    continue
                self.current_url = next_url
                self.crawl()

In [7]:
# Crawler rooted at the start URL, limited to two levels of links.
c = Crawler(current_url=URL, base_url=BASE_URL, depth=2)

In [None]:
# Run the crawl. This performs live HTTP requests and, for every external link
# found, prints the link and stores a record through Crawler.to_db.
c.queue()

Level of crawling :  0
http://www.ngs.noaa.gov/cgi-bin/ds_mark.prl?PidBox=aa3449
https://web.archive.org/web/20150924061607/http://www.ordnancesurvey.co.uk/docs/support/guide-coordinate-systems-great-britain.pdf
http://www.ordnancesurvey.co.uk/docs/support/guide-coordinate-systems-great-britain.pdf
http://home.hiwaay.net/~taylorc/bookshelf/math-science/geodesy/positioning/index.html
https://ourarchive.otago.ac.nz/bitstream/handle/10523/1713/McPhailCameron2011MA.pdf
https://books.google.com/books?id=LVp_gkwyvC8C&pg=PA102
https://web.archive.org/web/20120806065207/http://wwp.millennium-dome.com/info/conference.htm
http://wwp.millennium-dome.com/info/conference.htm
https://spatialreference.org/ref/epsg/4326/
http://www.paulbolstad.net/5thedition/samplechaps/Chapter3_5th_small.pdf
https://web.archive.org/web/20110721130505/http://www.osi.ie/GetAttachment.aspx?id=25113681-c086-485a-b113-bab7c75de6fa
http://www.osi.ie/GetAttachment.aspx?id=25113681-c086-485a-b113-bab7c75de6fa
https://books.g

In [26]:
# Inspect the (source page, external URL, status) tuples collected by the crawl.
# Every probed external link is recorded here, including those that answered 200.
c.rotten_links

[('https://en.wikipedia.org/wiki/1989%E2%80%9390_Slovenian_Republic_League',
  'http://www.nzs.si',
  200),
 ('https://en.wikipedia.org/wiki/Slovenian_Republic_League',
  'https://www.google.com/search?as_eq=wikipedia&q=%22Slovenian+Republic+Football+League%22',
  200),
 ('https://en.wikipedia.org/wiki/Slovenian_Republic_League',
  'https://www.google.com/search?tbm=nws&q=%22Slovenian+Republic+Football+League%22+-wikipedia',
  200),
 ('https://en.wikipedia.org/wiki/Slovenian_Republic_League',
  'https://www.google.com/search?&q=%22Slovenian+Republic+Football+League%22+site:news.google.com/newspapers&source=newspapers',
  200),
 ('https://en.wikipedia.org/wiki/Slovenian_Republic_League',
  'https://www.google.com/search?tbs=bks:1&q=%22Slovenian+Republic+Football+League%22+-wikipedia',
  200),
 ('https://en.wikipedia.org/wiki/Slovenian_Republic_League',
  'https://scholar.google.com/scholar?q=%22Slovenian+Republic+Football+League%22',
  200),
 ('https://en.wikipedia.org/wiki/Slovenian_Re