In [None]:
#default_exp crawler

In [None]:
#hide
from nbdev.showdoc import *

# Crawler
> Defines functions to crawl web pages within a specific domain, extract their contents, and store them in a DataFrame.

In [None]:
#hide
#export
from search_engine.scrapper import parse_webpage
from urllib.parse import urljoin, urlparse

from collections import deque
import pandas as pd
import re

In [None]:
#hide
#export
def link_filter(link, domain, base_url):
    """
    Decides whether `link` (an href found on the page `base_url`) should be crawled.

    A link is rejected (returns `False`) if it...

    1. Is `None` or empty
    2. Contains an `@` (e.g. `mailto:` links)
    3. Has a path ending in one of the extensions
       pdf|jpg|jpeg|doc|docx|ppt|pptx|png|txt|exe|ps|psb
    4. Is neither a root-relative link (`/path`) nor an http(s) link whose host
       is `domain` or one of its sub-domains
    5. Resolves to `base_url` itself (the page it was found on)

    Returns `True` otherwise.
    """
    blocked_extensions = ('pdf', 'jpg', 'jpeg', 'doc', 'docx', 'ppt', 'pptx',
                          'png', 'txt', 'exe', 'ps', 'psb')

    if not link or '@' in link:
        return False

    try:
        parsed = urlparse(link)
        host = (parsed.hostname or '').lower()
        absolute = urljoin(base_url, link)
    except ValueError:
        # Malformed hrefs (e.g. a broken IPv6 literal) cannot be crawled.
        return False

    # Look at the extension of the last *path* segment only. Searching the
    # whole URL would make 'ps' match every 'https://' link and 'doc' match
    # '/docs/...'.
    last_segment = parsed.path.rsplit('/', 1)[-1].lower()
    _, dot, extension = last_segment.rpartition('.')
    if dot and extension in blocked_extensions:
        return False

    if parsed.netloc:
        # Absolute or protocol-relative ('//host/path') link: compare the real
        # host, so the domain appearing in a query string or as a prefix of
        # another host does not count.
        if parsed.scheme not in ('', 'http', 'https'):
            return False
        wanted = domain.lower()
        if host != wanted and not host.endswith('.' + wanted):
            return False
    elif not link.startswith('/'):
        # No host and not root-relative: fragments, 'javascript:' links, etc.
        return False

    return absolute != base_url

In [None]:
show_doc(link_filter)

<h4 id="link_filter" class="doc_header"><code>link_filter</code><a href="__main__.py#L3" class="source_link" style="float:right">[source]</a></h4>

> <code>link_filter</code>(**`link`**, **`domain`**, **`base_url`**)

Filters out links if they... 

1. Are not from specified domain 

2. Contain extensions - pdf|jpg|jpeg|doc|docx|ppt|pptx|png|txt|exe|ps|psb 

3. Contain an `@` 

4. Have already been visited

In [None]:
#hide
#export
def link_modifier(url, base_url):
    """
    Normalises a link found on the page `base_url`:

    1. Converts `relative` urls to absolute ones (via `urljoin`)
    2. Drops a single trailing `/`
    3. Upgrades an `http://` scheme to `https://`

    Returns the normalised url as a string.
    """
    url = urljoin(base_url, url)
    # `endswith` is safe on an empty string, unlike indexing with [-1].
    if url.endswith('/'):
        url = url[:-1]
    # Rewrite the scheme only. A blanket replace of 'http' would also rewrite
    # 'http' inside the path, and a substring test for 'https' would skip
    # http URLs that merely mention 'https' somewhere else.
    plain_prefix = 'http://'
    if url.startswith(plain_prefix):
        url = 'https://' + url[len(plain_prefix):]
    return url

In [None]:
show_doc(link_modifier)

<h4 id="link_modifier" class="doc_header"><code>link_modifier</code><a href="__main__.py#L3" class="source_link" style="float:right">[source]</a></h4>

> <code>link_modifier</code>(**`url`**, **`base_url`**)

Converts `relative` urls to absolute ones.

In [None]:
#hide
#export
def crawl(domain='uic.edu',
         url='https://cs.uic.edu',
         num_pages=5):
    """
    Starts crawling the specified url and linked urls in a breadth-first fashion,
    extracts content and puts them in a DataFrame that will be returned.

    Parameters:
    - `domain`: passed to `link_filter` to decide which links are followed
    - `url`: page the crawl starts from
    - `num_pages`: maximum number of successfully crawled pages to collect

    Returns a DataFrame with one row per crawled page and the columns
    `id`, `url`, `content`, `graph` and `outgoing_links`. `graph` is never
    filled by this function; the column is kept only so the frame has the
    same columns as before.

    Pages that raise while being fetched or parsed are reported with `print`
    and skipped; they do not count towards `num_pages`.
    """
    # Queue of links still to crawl, in breadth-first order
    crawl_q = deque([url])
    # Every url that has ever been queued. Maintained incrementally so the
    # queue does not have to be copied into a set on each iteration.
    queued = {url}
    # One dict per crawled page. The DataFrame is built once at the end,
    # because `DataFrame.append` no longer exists in pandas >= 2.0.
    records = []

    while crawl_q and len(records) < num_pages:
        page_url = crawl_q.popleft()
        try:
            content, links = parse_webpage(page_url)
            # Drop invalid links, then turn relative urls into absolute ones
            links = [link_modifier(link, page_url)
                     for link in links
                     if link_filter(link, domain, page_url)]
        except Exception as exc:
            # Best effort: one broken page must not abort the crawl. Catching
            # `Exception` (not a bare except) keeps Ctrl-C working.
            print(f'Error crawling {page_url}: {exc!r}')
            continue

        # Remove duplicates while keeping first-seen order, so the crawl
        # order is the same on every run
        links = list(dict.fromkeys(links))

        records.append({'id': len(records), 'url': page_url,
                        'content': content, 'outgoing_links': links})
        print(f'Crawled {page_url}')

        # Queue only links that have never been queued before
        for link in links:
            if link not in queued:
                queued.add(link)
                crawl_q.append(link)

    return pd.DataFrame(records,
                        columns=['id', 'url', 'content', 'graph', 'outgoing_links'])

In [None]:
show_doc(crawl)

<h4 id="crawl" class="doc_header"><code>crawl</code><a href="__main__.py#L3" class="source_link" style="float:right">[source]</a></h4>

> <code>crawl</code>(**`domain`**=*`'uic.edu'`*, **`url`**=*`'https://cs.uic.edu'`*, **`num_pages`**=*`5`*)

Starts crawling the specified url and linked urls in a breadth-first fashion,
extracts content and puts them in a DataFrame that will be returned