# Zadanie 3.
Napisz własny system do indeksowania stron internetowych, 
który: <br/>
- przegląda strony i zapamiętuje liczbę wystąpień poszczególnych słów na poszczególnych stronach
- zachowuje się podobnie jak pythonowy słownik, gdzie kluczem jest słowo, a wartością lista stron na których to słowo występuje (bądź lista pusta).
Strony powinny być uszeregowane malejąco względem podanej liczby wystąpień. <br/>
Możesz też zaproponować własną strategię rankowania stron. <br/>
Zakładamy, że indeksujemy tylko stronę wskazaną jako parametr odpowiedniej <br/>
funkcji czy metody, oraz strony do których da się dojść po linkach i href w nie <br/>
więcej niz z góry zadana liczba kroków. <br/>

In [1]:
# Set up
from collections import OrderedDict 
from operator import itemgetter
import urllib
from bs4 import BeautifulSoup
from bs4.element import Comment
from collections import Counter
from string import punctuation
import re
from multiprocessing import Pool
import multiprocessing
from collections import deque
import math
import tqdm

In [2]:
WORDS = {}
visited = set()

def make_soup(url):
    try:
        page = urllib.request.urlopen(url)
    except urllib.request.HTTPError as e:
        print('Ignored: ', e)
        return -1
        
    req = urllib.request.Request(url , headers={'User-Agent': 'Mozilla/5.0'})
    page = urllib.request.urlopen(req).read()
    soup_data = BeautifulSoup(page, "html.parser")
    return soup_data


def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_html(url):
    soup = make_soup(url)
    if soup == -1:
        return ''
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    return u" ".join(t.strip() for t in visible_texts)


def get_words(url):
    text = text_from_html(url)
    # Delete punctuaction and empty words
    text = [re.sub('\W+', '', t) for t in text.lower().split()]
    text = list(filter(lambda t: len(t), text))
    return text

def get_hrefs(url):
    soup = make_soup(url)
    if soup == -1:
        return []
    hrefs = []
        
    for link in soup.find_all('a'):
        hrefs.append(link.get('href'))
        
    ans = []
    for link in hrefs:
        if link is None or len(link) == 0:
            continue
        if link.find('https') != -1 or link.find('http') != -1:
            ans.append(link)
        else:
            sub_sites = link.split('/')
            if len(sub_sites) > 1:
                key_word = '/' + sub_sites[1] + '/'
                pos = url.find(key_word)
                if pos != -1:
                    ans.append(url[:pos] + link)
                else:
                    ans.append(url + link)
          
    return ans

def get_dict():
    return dict(sorted(WORDS.items(), key=lambda x: len(x[1]), reverse=True))

def BFS(url, max_steps):
    Q = deque()
    Q.append((url, 0))
    visited = set()
    visited.add(url)
    
    while len(Q):
        act_url = Q[0][0]
        act_steps = Q[0][1]
        Q.popleft()
    
        print(f'Remaining: {len(Q)}. Act: ', act_url)
        
        words = get_words(act_url)
        hrefs = get_hrefs(act_url)
    
        for w in words:
            WORDS.setdefault(w, set()).add(act_url)

        for no, ref in enumerate(hrefs):
            if act_steps < max_steps and not ref in visited:
                Q.append((ref, act_steps + 1))
                visited.add(ref)

In [3]:
def scrap(url, depth):
    WORDS.clear()
    BFS(url, depth)
    return get_dict()

In [4]:
# Pick a site
site = 'https://en.wikipedia.org/wiki/Special:Random'
site2 = 'https://en.wikipedia.org/wiki/Mercury_Cyclone'
site3 = 'https://en.wikipedia.org/wiki/Alpman'

In [5]:
%%time
# Let's see all pages available from starting one 
words = scrap(site3, 1)

Remaining: 0. Act:  https://en.wikipedia.org/wiki/Alpman
Remaining: 55. Act:  https://en.wiktionary.org/wiki/Alpman
Remaining: 54. Act:  https://en.wikipedia.org/wiki/Ayten_Alpman
Remaining: 53. Act:  https://en.wikipedia.org/wiki/Fatma_Serpil_Alpman
Remaining: 52. Act:  https://en.wikipedia.org/wiki/Surname
Remaining: 51. Act:  https://en.wikipedia.org/w/index.php?title=Special:WhatLinksHere/Alpman&namespace=0
Remaining: 50. Act:  https://en.wikipedia.org/wiki/Wikipedia:Manual_of_Style/Linking
Remaining: 49. Act:  https://en.wikipedia.org/wiki/Given_name
Remaining: 48. Act:  https://en.wikipedia.org/w/index.php?title=Alpman&oldid=925236056
Remaining: 47. Act:  https://en.wikipedia.org/wiki/Help:Category
Remaining: 46. Act:  https://en.wikipedia.org/wiki/Category:Surnames
Remaining: 45. Act:  https://en.wikipedia.org/wiki/Category:Articles_with_short_description
Remaining: 44. Act:  https://en.wikipedia.org/wiki/Category:All_set_index_articles
Remaining: 43. Act:  https://en.wikipedia.

In [6]:
print(f'In total collected {len(words)} words')

pair = next(iter(words.items()))
print(f'Most common word is \'{pair[0]}\' and occured on {len(pair[1])} websites!')

In total collected 11848 words
Most common word is 'from' and occured on 48 websites!


# 2. Speed up by using multiprocessing

In [18]:
def get_multi_dict():
    return dict(sorted(WORDS.items(), key=lambda x: x[1], reverse=True))

def BFS_multi(url, max_steps):
    cores = multiprocessing.cpu_count() - 1
    cores = 50
    sites = [url]
    all_sites = [url]
    
    for no in range(max_steps):
        act_sites = []
        print(f'Step {no} out of {max_steps}')
        for i in tqdm.tqdm(range(0, len(sites), cores), desc='Looking for sites', 
                       position=0, leave=True):
            batch = sites[i: min(i + cores, len(sites))]
            with Pool(cores) as p:
                next_sites = p.map(get_hrefs, batch)
                for p_site in next_sites:
                    act_sites += p_site

        sites = act_sites
        all_sites += act_sites

    sites = all_sites
    words = []
    for i in tqdm.tqdm(range(0, len(sites), cores), desc='Scrapping words', 
                       position=0, leave=True):
        batch = sites[i: min(i + cores, len(sites))]
        with Pool(cores) as p:
            act_words = p.map(get_words, batch)
            words += act_words


    for word_list in tqdm.tqdm(words, desc='Counting words', position=0, leave=True):
        for w in word_list:
            WORDS[w] = WORDS.get(w, 0) + 1


    print('\n\n---SITES---\n\n')
    for i, s in enumerate(sites):
        print(f'{i} / {len(sites)} -> : ', s)

In [19]:
def multi_scrap(url, depth):
    WORDS.clear()
    BFS_multi(url, depth)
    return get_multi_dict()

In [21]:
%%time
if __name__ ==  '__main__':
    lecture_site = 'http://www.ii.uni.wroc.pl/~marcinm/'
    words = multi_scrap(site3, depth=1)
    
    print(f'In total collected {len(words)} words')

    pair = next(iter(words.items()))
    print(f'Most common word is \'{pair[0]}\' and occured {pair[1]} times!')

Looking for sites:   0%|          | 0/1 [00:00<?, ?it/s]

Step 0 out of 1


Looking for sites: 100%|██████████| 1/1 [00:00<00:00,  1.37it/s]
Scrapping words:   0%|          | 0/2 [00:00<?, ?it/s]

Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 403: Forbidden


Scrapping words: 100%|██████████| 2/2 [00:05<00:00,  2.72s/it]
Counting words: 100%|██████████| 63/63 [00:00<00:00, 1111.66it/s]



---SITES---


0 / 63 -> :  https://en.wikipedia.org/wiki/Alpman
1 / 63 -> :  https://en.wiktionary.org/wiki/Alpman
2 / 63 -> :  https://en.wikipedia.org/wiki/Ayten_Alpman
3 / 63 -> :  https://en.wikipedia.org/wiki/Fatma_Serpil_Alpman
4 / 63 -> :  https://en.wikipedia.org/wiki/Surname
5 / 63 -> :  https://en.wikipedia.org/w/index.php?title=Special:WhatLinksHere/Alpman&namespace=0
6 / 63 -> :  https://en.wikipedia.org/wiki/Wikipedia:Manual_of_Style/Linking
7 / 63 -> :  https://en.wikipedia.org/wiki/Given_name
8 / 63 -> :  https://en.wikipedia.org/w/index.php?title=Alpman&oldid=925236056
9 / 63 -> :  https://en.wikipedia.org/wiki/Help:Category
10 / 63 -> :  https://en.wikipedia.org/wiki/Category:Surnames
11 / 63 -> :  https://en.wikipedia.org/wiki/Category:Articles_with_short_description
12 / 63 -> :  https://en.wikipedia.org/wiki/Category:All_set_index_articles
13 / 63 -> :  https://en.wikipedia.org/wiki/Category:Monitored_short_pages
14 / 63 -> :  https://en.wikipedia.org/wiki/Special


