# Zadanie 3.
Napisz własny system do indeksowania stron internetowych, 
który: <br/>
- przegląda strony i zapamiętuje liczbę wystąpień poszczególnych słów na poszczególnych stronach
- zachowuje się podobnie jak pythonowy słownik, gdzie kluczem jest słowo, a wartością lista stron na których to słowo występuje (bądź lista pusta).
Strony powinny być uszeregowane malejąco względem podanej liczby wystąpień. <br/>
Możesz też zaproponować własną strategię rankowania stron. <br/>
Zakładamy, że indeksujemy tylko stronę wskazaną jako parametr odpowiedniej <br/>
funkcji czy metody, oraz strony do których da się dojść po linkach i href w nie <br/>
więcej niz z góry zadana liczba kroków. <br/>

In [2]:
# Set up
from collections import OrderedDict 
from operator import itemgetter
import urllib
from bs4 import BeautifulSoup
from bs4.element import Comment
from collections import Counter
from string import punctuation
import re
from multiprocessing import Pool

In [3]:
# dictionary of all words
WORDS = {}

In [4]:
def make_soup(url):
    try:
        page = urllib.request.urlopen(url)
    except urllib.request.HTTPError as e:
        print('Ignored: ', e)
        return -1
        
    req = urllib.request.Request(url , headers={'User-Agent': 'Mozilla/5.0'})
    page = urllib.request.urlopen(req).read()
    soup_data = BeautifulSoup(page, "html.parser")
    return soup_data


def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_html(url):
    soup = make_soup(url)
    if soup == -1:
        return ''
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    return u" ".join(t.strip() for t in visible_texts)


def get_words(url):
    text = text_from_html(url)
    # Delete punctuaction and empty words
    text = [re.sub('\W+', '', t) for t in text.lower().split()]
    text = list(filter(lambda t: len(t), text))
    return text

def get_hrefs(url):
    soup = make_soup(url)
    if soup == -1:
        return []
    hrefs = []
        
    for link in soup.find_all('a'):
        hrefs.append(link.get('href'))
        
    ans = []
    for link in hrefs:
        if link is None or len(link) == 0:
            continue
        if link.find('https') != -1 or link.find('http') != -1:
            ans.append(link)
        else:
            sub_sites = link.split('/')
            if len(sub_sites) > 1:
                key_word = '/' + sub_sites[1] + '/'
                pos = url.find(key_word)
                if pos != -1:
                    ans.append(url[:pos] + link)
                else:
                    ans.append(url + link)
          
    return ans

def DFS(url, step, max_step, no, sites_no):
    if step >= max_step:
        return
    
    print(f'{no} / {sites_no} -> : ', url)
        
    words = get_words(url)
    hrefs = get_hrefs(url)
    
    for w in words:
        WORDS.setdefault(w, set()).add(url)
        
    for no, ref in enumerate(hrefs):
        DFS(ref, step + 1, max_step, no + 1, len(hrefs))
        
def get_words():
    return dict(sorted(WORDS.items(), key=lambda x: len(x[1]), reverse=True))

In [5]:
# Pick a site
site = 'https://en.wikipedia.org/wiki/Special:Random'
site2 = 'https://en.wikipedia.org/wiki/Mercury_Cyclone'
site3 = 'https://en.wikipedia.org/wiki/Alpman'

In [226]:
# Let's see all pages available from starting one 
WORDS = {}
steps = 2
DFS(site3, 0, steps, 1, 1)

1 / 1 -> :  https://en.wikipedia.org/wiki/Alpman
1 / 59 -> :  https://en.wikipedia.org/wiki/Ayten_Alpman
2 / 59 -> :  https://en.wikipedia.org/wiki/Fatma_Serpil_Alpman
3 / 59 -> :  https://en.wikipedia.org/wiki/Surname
4 / 59 -> :  https://en.wikipedia.org/w/index.php?title=Special:WhatLinksHere/Alpman&namespace=0
Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 404: Not Found
5 / 59 -> :  https://en.wikipedia.org/wiki/Wikipedia:Manual_of_Style/Linking
6 / 59 -> :  https://en.wikipedia.org/wiki/Given_name
7 / 59 -> :  https://en.wikipedia.org/w/index.php?title=Alpman&oldid=831244460
8 / 59 -> :  https://en.wikipedia.org/wiki/Help:Category
9 / 59 -> :  https://en.wikipedia.org/wiki/Category:Surnames
10 / 59 -> :  https://en.wikipedia.org/wiki/Category:Articles_with_short_description
11 / 59 -> :  https://en.wikipedia.org/wiki/Category:All_set_index_articles
12 / 59 -> :  https://en.wikipedia.org/wiki/Special:MyTalk
13 / 59 -> :  https://en.wikipedia.org/wiki/Special:MyContributi

In [229]:
print(f'In total collected {len(WORDS)} words')

In total collected 11533 words


In [282]:
WORDS_SORTED = get_words()

In [284]:
pair = next(iter(x.items()))
print(f'Most common word is \'{pair[0]}\' and occured {len(pair[1])} times!')

Most common word is 'from' and occured 45 times!


# 2. Speed up by using multiprocessing