In [12]:
import re
import time
import urllib
import urllib.parse

import networkx as nx
import requests
from bs4 import BeautifulSoup

In [13]:
# Root of the wiki being crawled; relative hrefs are resolved against it.
BASE_URL = "https://en.wikipedia.org"
# Entry page, given relative to BASE_URL (Wikipedia's "random article" page).
START_URL = "wiki/Special:Random"

# Absolute address of the first page the crawler will request.
URL = urllib.parse.urljoin(base=BASE_URL, url=START_URL)

In [23]:
class Crawler:
    """Breadth-first crawler over wiki article pages that also probes external links.

    Starting from ``current_url``, every link found inside the page's
    ``mw-content-text`` container is recorded as an edge of ``crawling_graph``.
    Internal links (same host as the current page) are queued for crawling;
    external links are requested once and their outcome is stored in
    ``rotten_links``.

    Attributes:
        current_url: URL of the page being (or about to be) crawled.
        base_url: Root URL used to resolve relative hrefs.
        depth: Number of breadth-first levels ``queue`` walks through.
        crawled: Set of URLs already fetched (requested and post-redirect forms).
        to_crawl: FIFO list of internal URLs waiting to be fetched.
        crawling_graph: ``networkx.DiGraph`` whose nodes are URL *paths* and
            whose edges carry a ``domain`` attribute ("internal"/"external").
            NOTE(review): because nodes are keyed by path only, external URLs
            on different hosts that share a path map to the same node --
            confirm this is intended.
        num_crawled_pages: Count of pages successfully fetched so far.
        rotten_links: List of ``(source page URL, external URL, outcome)``
            tuples for *every* external link probed, where outcome is an HTTP
            status code or an error string.
        max_pages: ``queue`` stops once this many pages have been crawled.
        timeout: Per-request timeout in seconds for all HTTP requests.
        delay: Seconds to sleep after each crawled page.
    """

    # hrefs ending in a dot followed by at most four characters (file-like links).
    _EXTENSION_RE = re.compile(r"\..{0,4}$")
    # Non-article namespaces that must not be followed.
    _SPECIAL_RE = re.compile(
        r"wiki/(?:Template|Template_talk|Special|Wikipedia|Category|Help|Portal):"
    )
    # hrefs that are not pure in-page anchors.
    _NO_HASH_RE = re.compile("^(?!#)")
    # Browser-like headers sent when probing external links.
    _EXTERNAL_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome',
        'Accept': 'text/html, application/xhtml+xml, application/xml;q=0.9,image/webp,*/*;q=0.8',
        # The HTTP header name is spelled "Referer" (single "r" in the middle).
        'Referer': 'https://www.google.fr',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'fr-CH, fr;q=0.9, en-US,en;q=0.9, de;q=0.7, *;q=0.5'
    }

    def __init__(self, current_url, base_url, depth, max_pages=1000, timeout=10, delay=1):
        """Set up an empty crawl state.

        Args:
            current_url: URL of the first page to crawl.
            base_url: Root URL used to resolve relative hrefs.
            depth: Number of breadth-first levels to crawl.
            max_pages: Upper bound on the number of crawled pages.
            timeout: Timeout in seconds applied to every HTTP request.
            delay: Pause in seconds after each crawled page.
        """
        self.current_url = current_url
        self.crawled = set()
        self.to_crawl = []
        self.base_url = base_url
        self.crawling_graph = nx.DiGraph()
        self.num_crawled_pages = 0
        self.depth = depth
        self.rotten_links = []
        self.max_pages = max_pages
        self.timeout = timeout
        self.delay = delay
        # URLs that have ever been put on ``to_crawl``; gives O(1) de-duplication.
        self._queued = set()

    def same_domain_absolute_path(self, url, domain="/wiki"):
        """Return the absolute URL of an anchor whose href starts with ``domain``.

        Args:
            url: Mapping-like anchor exposing ``url["href"]``.
            domain: Required href prefix.

        Returns:
            The href resolved against ``base_url``, or ``None`` when the href
            does not start with ``domain``.
        """
        if url["href"].startswith(domain):
            return urllib.parse.urljoin(self.base_url, url["href"])
        return None

    def check_domain(self, url, domain):
        """Classify ``url`` as "internal" if its host equals ``domain``, else "external"."""
        host = urllib.parse.urlparse(url).netloc
        return "internal" if host == domain else "external"

    def add_links(self, url, domain: str):
        """Record a link from the current page to ``url`` in ``crawling_graph``.

        Both endpoints are stored by URL path, and the edge carries
        ``domain`` ("internal" or "external") as its ``domain`` attribute.
        """
        source = urllib.parse.urlparse(self.current_url).path
        target = urllib.parse.urlparse(url).path
        self.crawling_graph.add_edge(source, target, domain=domain)

    def url_filters(self, url):
        """Drop anchors that should not be followed.

        An anchor is rejected when its href contains "wiki" and ends in a
        file-like extension, or when it points at one of the non-article
        namespaces (Template, Template_talk, Special, Wikipedia, Category,
        Help, Portal).

        Args:
            url: Mapping-like anchor exposing ``url["href"]``.

        Returns:
            ``url`` unchanged when it passes the filters, otherwise ``None``.
        """
        href = url["href"]
        if self._EXTENSION_RE.search(href) and "wiki" in href:
            return None
        if self._SPECIAL_RE.search(href):
            return None
        return url

    def try_external_link(self, url):
        """Request an external URL and report the outcome.

        Returns:
            The HTTP status code on success, "Connection refused" on a
            connection error or timeout, and "Request failed" for any other
            ``requests`` error (e.g. an unsupported scheme such as mailto:).
        """
        print(url)
        try:
            # The context manager releases pooled connections after the probe.
            with requests.Session() as session:
                prepared = requests.Request(
                    'GET', url, headers=self._EXTERNAL_HEADERS
                ).prepare()
                # Without a timeout an unresponsive host blocks the crawl forever.
                response = session.send(prepared, timeout=self.timeout)
                return response.status_code
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            return "Connection refused"
        except requests.exceptions.RequestException:
            return "Request failed"

    def crawl(self):
        """Fetch ``current_url``, record its links and queue unseen internal ones.

        A page that cannot be fetched is marked as crawled (so it is not
        retried) and skipped without incrementing ``num_crawled_pages``.
        """
        requested_url = self.current_url
        try:
            r = requests.get(requested_url, timeout=self.timeout)
        except requests.exceptions.RequestException as error:
            self.crawled.add(requested_url)
            print("Could not fetch", requested_url, ":", error)
            return

        # Follow redirects: from here on the page is known by its final URL.
        self.current_url = r.url
        self.num_crawled_pages += 1
        # Remember both forms so the pre-redirect URL is not fetched again.
        self.crawled.update((requested_url, self.current_url))

        current_location = urllib.parse.urlparse(self.current_url)
        self.crawling_graph.add_node(current_location.path)

        soup = BeautifulSoup(r.text, "lxml")
        body = soup.find("div", attrs={"id": "mw-content-text"})
        # Pages without the content container simply contribute no links.
        anchors = body.find_all("a", href=self._NO_HASH_RE) if body is not None else []

        for anchor in anchors:
            if self.url_filters(anchor) is None:
                continue
            url = urllib.parse.urljoin(self.base_url, anchor["href"])
            domain_status = self.check_domain(url, current_location.netloc)
            self.add_links(url, domain_status)

            if domain_status == "internal":
                # "#section" variants are the same page; queue it only once.
                page_url = urllib.parse.urldefrag(url).url
                if page_url not in self.crawled and page_url not in self._queued:
                    self._queued.add(page_url)
                    self.to_crawl.append(page_url)
            else:
                code = self.try_external_link(url)
                self.rotten_links.append((self.current_url, url, code))

        # Be polite to the server between page fetches.
        time.sleep(self.delay)
        print(self.num_crawled_pages)

    def queue(self):
        """Run the breadth-first crawl for ``depth`` levels.

        Stops as soon as ``max_pages`` pages have been crawled, regardless of
        the level being processed.
        """
        self.to_crawl.append(self.current_url)
        self._queued.add(self.current_url)

        for level in range(self.depth):
            print("Level of crawling : ", level)
            # Snapshot the size: URLs appended during this level belong to the next.
            for _ in range(len(self.to_crawl)):
                self.current_url = self.to_crawl.pop(0)
                if self.current_url in self.crawled:
                    continue
                self.crawl()
                if self.num_crawled_pages >= self.max_pages:
                    # Leave both loops; a plain break would resume on the next level.
                    return

In [24]:
# Crawl two breadth-first levels starting from the entry URL.
c = Crawler(current_url=URL, base_url=BASE_URL, depth=2)

In [25]:
# Run the crawl; the crawling level, each probed external URL and the running
# page count are printed as it progresses.
c.queue()

Level of crawling :  0
http://www.nzs.si
1
Level of crawling :  1
https://www.google.com/search?as_eq=wikipedia&q=%22Slovenian+Republic+Football+League%22
https://www.google.com/search?tbm=nws&q=%22Slovenian+Republic+Football+League%22+-wikipedia
https://www.google.com/search?&q=%22Slovenian+Republic+Football+League%22+site:news.google.com/newspapers&source=newspapers
https://www.google.com/search?tbs=bks:1&q=%22Slovenian+Republic+Football+League%22+-wikipedia
https://scholar.google.com/scholar?q=%22Slovenian+Republic+Football+League%22
https://www.jstor.org/action/doBasicSearch?Query=%22Slovenian+Republic+Football+League%22&acc=on&wc=on
http://www.league321.com/slovenia-football.html
2
http://www.tuttotrieste.net/prsng/grezar.htm
http://www.nzs.si/tekmovanja/?action=vsiKlubi&id_menu=35
http://www.uefa.com/uefaeuropaleague/season=1992/clubs/club=57491/matches/index.html
3
4
http://stadioni.org/kamnik/igrisce_virtusa_na_smarci
http://www.dlib.si/stream/URN:NBN:SI:DOC-WFHABV95/16ba3def-d

KeyboardInterrupt: 

In [26]:
# Inspect the (source page URL, external URL, outcome) tuples collected so far.
c.rotten_links

[('https://en.wikipedia.org/wiki/1989%E2%80%9390_Slovenian_Republic_League',
  'http://www.nzs.si',
  200),
 ('https://en.wikipedia.org/wiki/Slovenian_Republic_League',
  'https://www.google.com/search?as_eq=wikipedia&q=%22Slovenian+Republic+Football+League%22',
  200),
 ('https://en.wikipedia.org/wiki/Slovenian_Republic_League',
  'https://www.google.com/search?tbm=nws&q=%22Slovenian+Republic+Football+League%22+-wikipedia',
  200),
 ('https://en.wikipedia.org/wiki/Slovenian_Republic_League',
  'https://www.google.com/search?&q=%22Slovenian+Republic+Football+League%22+site:news.google.com/newspapers&source=newspapers',
  200),
 ('https://en.wikipedia.org/wiki/Slovenian_Republic_League',
  'https://www.google.com/search?tbs=bks:1&q=%22Slovenian+Republic+Football+League%22+-wikipedia',
  200),
 ('https://en.wikipedia.org/wiki/Slovenian_Republic_League',
  'https://scholar.google.com/scholar?q=%22Slovenian+Republic+Football+League%22',
  200),
 ('https://en.wikipedia.org/wiki/Slovenian_Re