In [1]:
from urllib.parse import urlparse
from urllib.parse import urldefrag
from urllib.request import urlopen
from file_storage import FileStorage
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from IPython.display import clear_output
import os

In [2]:
def download_from_the_internet(url):
    """Fetch ``url`` and return its body decoded as UTF-8.

    This is a best-effort helper: any ordinary failure (malformed URL,
    network or HTTP error, body that is not valid UTF-8) yields ``None``
    instead of an exception.

    Args:
        url: Address to fetch; passed unchanged to ``urlopen``.

    Returns:
        The decoded response body as ``str``, or ``None`` if the download or
        the decoding failed.
    """
    try:
        # The context manager closes the response even when read()/decode()
        # raises, so failed downloads do not leak open connections.
        with urlopen(url) as response:
            return response.read().decode('utf-8')
    except Exception:
        # Only ``Exception`` subclasses are swallowed: KeyboardInterrupt and
        # SystemExit derive from BaseException and therefore still propagate.
        return None

def extract_links_from_html(url, html):
    """Return the absolute targets of every ``<a href=...>`` found in ``html``.

    Args:
        url: Address the page was loaded from; relative links are resolved
            against it with ``urljoin``.
        html: Markup of the page.

    Returns:
        A list of absolute URLs, one per anchor that has an ``href``
        attribute, in document order (duplicates are kept).
    """
    # Naming the parser keeps the result independent of which optional parser
    # libraries happen to be installed; "html.parser" ships with Python.
    parser = BeautifulSoup(html, "html.parser")
    # href=True skips anchors without an href (e.g. <a name="top">); without
    # it ``urljoin(url, None)`` would report the page itself as a link.
    return [urljoin(url, link['href']) for link in parser.find_all('a', href=True)]

In [18]:
class WikiDownloader:
    """Level-by-level crawler that stores pages and tracks the URL frontier.

    The frontier (``_active_urls``) holds the URLs that still have to be
    fetched. One pass over it (``step``) downloads every URL, writes the page
    to the storage and replaces the frontier with the links found on those
    pages that are not stored yet and are accepted by every filter.

    Args:
        filters: Iterable of callables ``url -> bool``. A link is followed
            only when every filter returns a truthy value.
        storage: Page store. It is used through ``count()``,
            ``contains(url)``, ``write(url, html)``, ``read(url)`` and a
            ``dict`` attribute whose ``keys()`` are the stored URLs.
    """

    def __init__(self, filters, storage):
        self._filters = filters
        self._storage = storage
        self._active_urls = []
        # History kept for progress display: storage size and frontier size
        # recorded at the start of every pass of ``process``.
        self._counters = []
        self._active_links_lens = []
        # Number of "process url" lines printed since the output was cleared.
        self._print_counter = 0

    def start_with_seed(self, seed):
        """Crawl starting from the single URL ``seed``."""
        self._active_urls = [seed]
        self.process()

    def start_with_urls(self, urls):
        """Crawl starting from the collection ``urls``."""
        self._active_urls = urls
        self.process()

    def process(self):
        """Run passes over the frontier until it is empty.

        Before each pass the storage size and the frontier size are appended
        to the history lists and both lists are printed.
        """
        while len(self._active_urls) > 0:
            self._counters.append(self._storage.count())
            self._active_links_lens.append(len(self._active_urls))
            clear_output()
            print(self._counters)
            print(self._active_links_lens)
            self.step()

    def process_url(self, url):
        """Download ``url``, store it and return the new links it contains.

        Returns:
            A set of URLs to visit next. It is empty when the page could not
            be downloaded or when storing or parsing it failed; such failures
            are reported on stdout and never abort the crawl.
        """
        try:
            self._print_counter += 1
            # Clear the cell output every 100 pages so the log stays short.
            if self._print_counter == 100:
                self._print_counter = 0
                clear_output()
            print("process url", url)
            html = download_from_the_internet(url)
            if html is None:
                # Nothing was downloaded, so there is nothing to store.
                print("failed to download", url)
                return set()
            self._storage.write(url, html)
            return self.get_urls_from_html(url, html)
        except Exception as error:
            # Best effort: one bad page must not stop the crawl. Interrupts
            # (KeyboardInterrupt) are not ``Exception`` and still propagate.
            print("failed to process url", url, repr(error))
            return set()

    def get_urls_from_html(self, url, html):
        """Return the links in ``html`` that are worth crawling.

        A link qualifies when it is not in the storage yet and every filter
        accepts it.

        Args:
            url: Address of the page, used to resolve relative links.
            html: Markup of the page.

        Returns:
            A set of absolute URLs.
        """
        urls = set()
        for candidate_url in extract_links_from_html(url, html):
            if self._storage.contains(candidate_url):
                continue
            if all(url_filter(candidate_url) for url_filter in self._filters):
                urls.add(candidate_url)
        return urls

    def get_active_urls(self, max_htmls):
        """Rebuild the frontier from the pages that are already stored.

        Stored pages are re-read and their links collected, which restores
        the frontier without downloading anything.

        Args:
            max_htmls: Maximum number of stored pages to scan.
        """
        self._active_urls = set()
        count = self._storage.count()
        for i, url in enumerate(self._storage.dict.keys()):
            # Stop before reading page number ``max_htmls`` so that at most
            # ``max_htmls`` pages are scanned.
            if i >= max_htmls:
                break
            if i % 100 == 0:
                clear_output()
                print("process {} / {}".format(i, count))
            html = self._storage.read(url)
            self._active_urls.update(self.get_urls_from_html(url, html))

    def read_urls(self, filename):
        """Replace the frontier with the URLs listed in ``filename``.

        The file holds one URL per line; surrounding whitespace is stripped
        and blank lines are ignored.
        """
        self._active_urls = set()
        with open(filename) as handler:
            for line in handler:
                url = line.strip()
                if url:
                    self._active_urls.add(url)

    def step(self):
        """Process every URL of the frontier once and install the next one."""
        new_active_urls = set()
        for url in self._active_urls:
            new_active_urls.update(self.process_url(url))
        self._active_urls = new_active_urls

In [4]:
# Page a fresh crawl would start from (see the commented-out start_with_seed call).
SEED = "https://simple.wikipedia.org/wiki/Main_Page"
# Name passed to FileStorage for the crawl's page store.
STORAGE_NAME = "hw_first_try"
# Uncomment to remove the "<STORAGE_NAME>.dict" / "<STORAGE_NAME>.data" files
# (presumably the files backing the storage — confirm against FileStorage).
#for path in ["{}.dict".format(STORAGE_NAME), "{}.data".format(STORAGE_NAME)]:
#    if os.path.exists(path):
#        os.remove(path)

In [5]:
# A link is followed only when every predicate in this list accepts it.
filters = (
    # Stay inside the /wiki/ tree of Simple English Wikipedia.
    [lambda url: url.startswith("https://simple.wikipedia.org/wiki/")]
    # One predicate per excluded namespace. The default argument freezes the
    # prefix for each lambda; a plain closure would only see the last prefix.
    + [
        lambda url, _prefix=prefix: not urlparse(url).path.startswith(_prefix)
        for prefix in (
            "/wiki/Wikipedia:",
            "/wiki/Help:",
            "/wiki/Special:",
            "/wiki/File:",
            "/wiki/Talk:",
            "/wiki/T:",
            "/wiki/User:",
            "/wiki/Template:",
            "/wiki/Template_talk:",
            "/wiki/User_talk:",
        )
    ]
    # Reject links that carry a non-empty fragment.
    + [lambda url: urlparse(url).fragment == ""]
)

In [19]:
# Create the page store under STORAGE_NAME and build the crawler on top of it,
# using the link filters defined in the previous cell.
storage = FileStorage(STORAGE_NAME)
downloader = WikiDownloader(filters, storage)

In [7]:
#downloader.start_with_seed(SEED)

In [48]:
# Rebuild the crawl frontier from the pages already in storage. 1e10 is far
# above the number of stored pages, so in practice every page is scanned.
downloader.get_active_urls(1e10)
# Number of URLs left to crawl.
print(len(downloader._active_urls))

process 221600 / 221609
312


In [49]:
# Size of the current frontier.
print(len(downloader._active_urls))

312


In [25]:
# Save the frontier one URL per line, the layout that read_urls parses.
with open("active_urls_61489", "w") as handler:
    handler.writelines(url + "\n" for url in downloader._active_urls)

In [26]:
# Replace the frontier with the URLs listed in the "active_urls_61489" file.
downloader.read_urls("active_urls_61489")

In [50]:
# Keep crawling from the current frontier until it is empty.
downloader.process()

process url https://simple.wikipedia.org/wiki/European_route_E30
process url https://simple.wikipedia.org/wiki/U.S._50_in_California
process url https://simple.wikipedia.org/wiki/Rudolf_Schumann
process url https://simple.wikipedia.org/wiki/San_Diego_Gulls
process url https://simple.wikipedia.org/wiki/Aveiro,_Portugal
process url https://simple.wikipedia.org/wiki/Interstate_670_(Kansas%E2%80%93Missouri)
process url https://simple.wikipedia.org/wiki/Louisiana_Highway_1
process url https://simple.wikipedia.org/wiki/Raigad
process url https://simple.wikipedia.org/wiki/Parable_of_the_Pearl
process url https://simple.wikipedia.org/wiki/Parable_of_the_Unjust_Judge
process url https://simple.wikipedia.org/wiki/Illinois_Route_21
process url https://simple.wikipedia.org/wiki/U.S._Route_431_in_Alabama
process url https://simple.wikipedia.org/wiki/U.S._Route_17_in_Florida
process url https://simple.wikipedia.org/wiki/Svalbard_and_Jan_Mayen
process url https://simple.wikipedia.org/wiki/Interstate_

In [51]:
# Value reported by the storage's count() after the crawl.
storage.count()

221610

In [46]:
# Frontier size recorded at the start of each pass of process().
downloader._active_links_lens

[75624, 61489, 49777, 4035, 637, 314]

In [52]:
# storage.count() recorded at the start of each pass of process().
downloader._counters

[138260, 154561, 168070, 217559, 221282, 221607, 221609]

In [54]:
# Same count, read through the downloader's own reference to the storage.
downloader._storage.count()

221610

In [59]:
# Print up to 100 stored URLs that contain a "#".
# urlparse() puts everything after "#" into .fragment, so .path never contains
# "#" and the earlier test `"#" in urlparse(url).path` could not match any URL.
# The whole URL is searched instead.
i = 0  # number of URLs printed so far
for url in downloader._storage.dict.keys():
    #if urlparse(url).path.startswith("/wiki/Category:") or ("#" in url):
    if "#" in url:
        print(url)
        i += 1
        if i >= 100:
            break

In [63]:
# Two sample article URLs inspected in the cells that follow.
a = "https://simple.wikipedia.org/wiki/Pig"
b = "https://simple.wikipedia.org/wiki/Pigs"

In [64]:
# Is each sample URL among the keys of the storage?
a in downloader._storage.dict.keys(), b in downloader._storage.dict.keys()

(True, True)

In [None]:
# NOTE(review): `url` is not defined in this cell; it is whatever an earlier
# loop left behind, and the cell has no execution count — confirm it is needed.
urlparse(url).path.startswith("/wiki/Category:")

In [66]:
# Does the stored page for `a` contain the text "wgRedirectedFrom"?
"wgRedirectedFrom" in downloader._storage.read(a)

False

In [67]:
# Same check for `b`.
"wgRedirectedFrom" in downloader._storage.read(b)

True

In [68]:
# Second store that receives the filtered set of pages.
final_storage = FileStorage("final_storage")

In [69]:
# Copy every stored page into final_storage, leaving out pages whose HTML
# contains "wgRedirectedFrom" and pages under /wiki/Category:.
for i, url in enumerate(downloader._storage.dict.keys()):
    # Refresh the progress counter every 1000 pages.
    if i % 1000 == 0:
        clear_output()
        print(i)
    html = downloader._storage.read(url)
    if "wgRedirectedFrom" in html or urlparse(url).path.startswith("/wiki/Category:"):
        continue
    final_storage.write(url, html)

221000


In [70]:
# Value reported by final_storage.count() after the copy.
final_storage.count()

142806