# Crawler

This notebook contains starter code for building a web crawler that runs on a single machine.

**Author:** Noshaba Nasir

**Date:** 26/3/2021

**Updated by:** Muhammad Wasiq 17L-6315
    

In [1]:
# Add any library to be imported here
import os 
import re
import random
import requests
import threading
from time import time
from time import sleep
from numpy.random import choice
from bs4 import BeautifulSoup
from queue import Queue
from queue import PriorityQueue

import urllib.robotparser as robo
from urllib.parse import urlparse
from urllib.parse import  urljoin
from urllib.parse import urldefrag


In [2]:
# Crawler parameters
BACKQUEUES = 3              # number of per-host back queues in the frontier
THREADS = BACKQUEUES * 3    # worker threads: three per back queue
FRONTQUEUES = 5             # number of priority front queues in the frontier
WAITTIME = 15               # politeness gap (seconds) between fetches from the same back queue

# Shared crawl state
SCRAPED_DATA = list()       # reserved for scraped payloads
CRAWLERD_ROBOTS = dict()    # host name -> parsed robots.txt (RobotFileParser)
CRAWLED_URLS = set()        # URLs already handed out to a worker
CRAWL_LIMIT = 100           # stop once this many URLs have been handed out

# FRONTIER

Frontier should use the Mercator frontier design as discussed in lecture.

Preferably it should be a class and should have the given functions.

The *prioritizer* function is a stub for now: it returns a random integer between 1 and f for the given URL.

In [3]:
class frontier:
    """Mercator-style URL frontier.

    URLs enter through priority *front queues* and are moved on demand into
    *back queues*. `domain_map` records which back queue each host is
    currently assigned to, and `back_selector` is a priority queue of
    `(timestamp, back_queue_index)` pairs used to pick the back queue that
    may be contacted soonest.

    The methods do no locking of their own; callers that share one frontier
    between threads must serialise calls to `get_URL`.
    """

    # seconds to pause between polls while every front queue is empty
    _POLL_INTERVAL = 0.05

    def __init__(self, seeds, front_count, back_count, wait_time=None):
        """Build the queues and prime every back queue from the seeds.

        Args:
            seeds: iterable of seed URLs.
            front_count: number of priority front queues.
            back_count: number of back queues.
            wait_time: politeness gap in seconds between two fetches from the
                same back queue; defaults to the module-level WAITTIME.

        Raises:
            Exception: if the seeds cover fewer distinct hosts than there are
                back queues (each back queue needs its own host to start).
        """
        seeds = list(seeds)

        # validate before building any state: every back queue needs its own host
        seed_hosts = {urlparse(url).netloc for url in seeds}
        if len(seed_hosts) < back_count:
            raise Exception('Not enough unique seed domains')

        self.wait_time = WAITTIME if wait_time is None else wait_time

        # initializing required data structures
        self.front_queues = [Queue() for _ in range(front_count)]
        self.back_queues = [Queue() for _ in range(back_count)]
        self.domain_map = {}
        self.back_selector = PriorityQueue()

        self.add_URLs(seeds)   # insert seeds in front queues

        # fill each back queue with at least 1 seed
        for i in range(back_count):
            self.add_to_backqueue(i)
            # back-dated timestamp so the first fetch needs no waiting
            self.back_selector.put((time() - self.wait_time, i))

    def fetch_from_front(self):
        """Remove and return one URL from the front queues.

        A queue at index i is picked with probability proportional to i + 1,
        considering only queues that currently hold a URL. If every front
        queue is empty this call blocks, polling until a URL is added.
        """
        while True:
            ready = [i for i, q in enumerate(self.front_queues) if not q.empty()]
            if ready:
                break
            # nothing to hand out yet: wait instead of spinning at full speed
            sleep(self._POLL_INTERVAL)

        # weights renormalised over the non-empty queues; this is the same
        # distribution as re-drawing until a non-empty queue comes up
        weights = [i + 1 for i in ready]
        total = sum(weights)
        idx = choice(ready, p=[w / total for w in weights])

        return self.front_queues[idx].get()

    def get_URL(self):
        """Return the next URL to crawl and how long to wait before fetching it.

        Returns:
            (url, sleep_time): `sleep_time` is the number of seconds (float,
            never negative) the caller should sleep so that the back queue's
            politeness gap is respected.
        """
        timestamp, idx = self.back_selector.get()
        time_elapsed = time() - timestamp

        # remaining part of the politeness gap; kept as a float because
        # truncating to int would let the host be contacted too early
        sleep_time = max(0.0, self.wait_time - time_elapsed)

        next_url = self.back_queues[idx].get()  # get new url

        if self.back_queues[idx].empty():
            # release the host(s) assigned to this back queue
            self.domain_map = {host: q for host, q in self.domain_map.items() if q != idx}

            # fetch new domain for back queue
            self.add_to_backqueue(idx)

        # record the moment this fetch is expected to happen
        self.back_selector.put((time() + sleep_time, idx))

        return next_url, sleep_time

    def add_URLs(self, URLs):
        """Insert every URL into a front queue chosen by `prioritizer`.

        URLs are assumed to be new; no duplicate check is done here.
        """
        for url in URLs:
            priority = prioritizer(url, len(self.front_queues))  # value in 1..len(front_queues)
            self.front_queues[priority - 1].put(url)    # priority p lives at index p - 1

    def add_to_backqueue(self, idx):
        """Pull URLs from the front queues until back queue `idx` is non-empty.

        A URL whose host is already assigned to some back queue is appended
        to that queue; the first URL with an unassigned host claims queue
        `idx`, which ends the loop.
        """
        while self.back_queues[idx].empty():
            # bring a url from a front queue to be added to a back queue
            url = self.fetch_from_front()
            url_domain = urlparse(url).netloc

            # insert fetched url in corresponding back queue
            if url_domain in self.domain_map:
                self.back_queues[self.domain_map[url_domain]].put(url)
            else:
                self.domain_map[url_domain] = idx
                self.back_queues[idx].put(url)

def prioritizer(URL,f):
    """
    Stub priority function.

    Ignores URL and returns a random integer p with 1 <= p <= f.
    """
    # randrange excludes its upper bound, hence f + 1
    return random.randrange(1, f + 1)
    

# URL Fetching and Filtering Pipeline

Get HTML by requesting given URL.
Parse HTML and extract all URLs.
Filter out the URLs that the server's robots.txt file disallows and those that have already been processed.

In [1]:
def fetch_data(URL, timeout=10):
    """Download URL and return the raw response body.

    Args:
        URL: address to request.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot block the calling thread forever.

    Returns:
        The response body as bytes.

    Raises:
        Exception: if the request fails, or if the response declares a
            Content-Type that does not contain 'html'. The original error is
            chained as the cause.
    """
    try:
        response = requests.get(URL, timeout=timeout)

        if 'Content-Type' in response.headers and 'html' not in response.headers['Content-Type']:
            raise Exception(f"Did not get a HTML response from {URL}")

        # save the html response as text file
        # file_name = '_'.join(URL.split('/')[2:])
        # with open(f'{file_name}.txt', 'w+', encoding="utf-8") as file:
        #      file.write(response.text)

        return response.content

    except Exception as e:
        raise Exception("Get Request Failed. " + str(e)) from e


def parse_data(URL, html_doc):
    """Extract the distinct outgoing http(s) links from an HTML page.

    Args:
        URL: address the page was fetched from; relative links are resolved
            against it.
        html_doc: HTML markup of the page.

    Returns:
        List of absolute URLs with fragment identifiers removed and
        duplicates dropped (order is not defined).
    """
    soup = BeautifulSoup(html_doc, 'html.parser')   # use bs4 html parser

    links = set()
    # href=True skips anchors that carry no href attribute at all
    for anchor in soup.find_all('a', href=True):
        # resolve against the page URL itself so that relative paths such as
        # 'page.html' or '../x' and the page's own scheme are handled correctly
        absolute = urljoin(URL, anchor['href'].strip())

        # remove fragment identifier; the set removes the resulting duplicates
        absolute = urldefrag(absolute)[0]

        # keep only crawlable web links (drops mailto:, javascript:, tel:, ...)
        parts = urlparse(absolute)
        if parts.scheme in ('http', 'https') and parts.netloc:
            links.add(absolute)

    return list(links)

def filter_URLs(URL, fetched_URLs):
    """Keep only the URLs that their own host's robots.txt allows.

    Args:
        URL: address of the page the links came from. Not used for the
            decision; kept so existing callers keep working.
        fetched_URLs: iterable of absolute candidate URLs.

    Returns:
        List of distinct allowed URLs (order is not defined). URLs that are
        not http(s), have no host, or whose robots.txt cannot be retrieved
        are left out.

    Parsed robots.txt files are cached per host in CRAWLERD_ROBOTS.
    NOTE: robots.txt is downloaded once for every host not seen before, so a
    page linking to many new hosts triggers one request per host.
    """
    # politeness; only keep URLs that the domain allows
    filtered_urls = set()
    unreachable_hosts = set()   # hosts whose robots.txt failed during this call

    for url in fetched_URLs:
        parts = urlparse(url)
        hostname = parts.netloc     # rules must come from the candidate's own host

        if parts.scheme not in ('http', 'https') or not hostname:
            continue
        if hostname in unreachable_hosts:
            continue

        # if host already visited, lookup robot.txt
        robot_parser = CRAWLERD_ROBOTS.get(hostname)
        if robot_parser is None:
            # if not, fetch robot.txt
            robot_link = 'http://' + hostname + '/robots.txt'
            robot_parser = robo.RobotFileParser(robot_link)
            try:
                robot_parser.read()
            except Exception:
                # permission unknown: skip this host's links rather than
                # failing the whole batch; not cached, so it is retried later
                unreachable_hosts.add(hostname)
                continue

            CRAWLERD_ROBOTS[hostname] = robot_parser

        if robot_parser.can_fetch('*', url):
            filtered_urls.add(url)

    return list(filtered_urls)

def dup_elimination(url_set, filtered_URLs):
    """Return the entries of filtered_URLs that are not members of url_set.

    Input order is preserved, and repeated entries inside filtered_URLs are
    kept as they are.
    """
    return [candidate for candidate in filtered_URLs if candidate not in url_set]


In [5]:
# Thread task
mutex = threading.Lock()
def crawler_thread_task(frontier):
    """Worker loop: take URLs from the frontier and crawl them until the limit is hit.

    Args:
        frontier: the shared frontier object (note: this parameter shadows the
            `frontier` class name inside the function).

    Each iteration takes one URL under `mutex`, records it in CRAWLED_URLS,
    then - outside the lock - waits the politeness delay, fetches the page,
    extracts and filters its links and adds the new ones to the frontier.
    A failure while processing one URL is reported and the loop moves on.
    """
    worker = threading.current_thread().name

    while True:

        # critical section; the `with` block releases the lock on every exit
        # path, including the break and an exception raised by get_URL
        with mutex:
            if len(CRAWLED_URLS) >= CRAWL_LIMIT:
                break

            curr_url, waiting_time = frontier.get_URL()
            CRAWLED_URLS.add(curr_url)

            print(f"[{worker}] Got URL [{len(CRAWLED_URLS)}/{CRAWL_LIMIT}] ")

        try:
            sleep(waiting_time)
            curr_domain = urlparse(curr_url).netloc
            print(f"[{worker}] Crawling {curr_domain} ... ")

            data = fetch_data(curr_url)
            print(f"[{worker}] Fetched {curr_url}")

            new_urls = parse_data(curr_url, data)
            print(f"[{worker}] Saved {curr_url}")

            filtered_urls = filter_URLs(curr_url, new_urls)
            print(f"[{worker}] Generating new URLs ... ")

            final_urls = dup_elimination(url_set=CRAWLED_URLS, filtered_URLs=filtered_urls)
            print(f"[{worker}] Generated {len(final_urls)} new URLs.")

            frontier.add_URLs(final_urls)
            print(f"[{worker}] Crawled {curr_domain}")

        except Exception as e:
            # one bad page must not stop the worker
            print(f"[{worker}] Failed to Crawl {curr_url} ... ")
            print(f"[{worker} Exception]", e)

    print(f"[{worker}] Exiting ...")


In [6]:
# Seed pages the crawl starts from. The frontier constructor raises unless
# they cover at least BACKQUEUES distinct hosts.
seed_urls = [
    'https://docs.oracle.com/en/',
    'https://www.oracle.com/corporate/',
    'https://en.wikipedia.org/wiki/Machine_learning',
    'https://www.csie.ntu.edu.tw/~cjlin/libsvm/index.html',
    'https://docs.oracle.com/middleware/jet210/jet/index.html',
    'https://en.wikipedia.org/w/api.php',
    'https://en.wikipedia.org/api/',
    'https://en.wikipedia.org/wiki/Weka_(machine_learning)',
]

# Frontier shared by all worker threads
URL_frontier = frontier(
    seeds=seed_urls,
    front_count=FRONTQUEUES,
    back_count=BACKQUEUES,
)


In [7]:
# Create one named worker thread per slot, all sharing the same frontier
workers = [
    threading.Thread(target=crawler_thread_task, name = f't{i+1}', args=(URL_frontier,))
    for i in range(THREADS)
]

# Launch every worker first, then wait for all of them to finish
for worker in workers:
    worker.start()

for worker in workers:
    worker.join()

# Final report
print("[main] Scraped Count: ", len(CRAWLED_URLS))

print("[main] Scraped URLs:\n", CRAWLED_URLS)


 URL [55/100] 
[t3] Crawling zh.wikipedia.org ... 
[t3] Fetched https://zh.wikipedia.org/wiki/Weka
[t3] Saved https://zh.wikipedia.org/wiki/Weka
[t3] Generating new URLs ... 
[t3] Generated 70 new URLs.
[t3] Crawled zh.wikipedia.org
[t3] Got URL [56/100] 
[t4] Crawling en.wikipedia.org ... 
[t4] Fetched https://en.wikipedia.org/wiki/Cluster_analysis
[t4] Saved https://en.wikipedia.org/wiki/Cluster_analysis
[t4] Generating new URLs ... 
[t4] Generated 775 new URLs.
[t4] Crawled en.wikipedia.org
[t4] Got URL [57/100] 
[t5] Crawling www.wikimediafoundation.org ... 
[t5] Fetched https://www.wikimediafoundation.org/
[t5] Saved https://www.wikimediafoundation.org/
[t6] Crawling stats.wikimedia.org ... 
[t6] Fetched https://stats.wikimedia.org/
[t6] Saved https://stats.wikimedia.org/
[t5] Generating new URLs ... 
[t5] Generated 77 new URLs.
[t5] Crawled www.wikimediafoundation.org
[t5] Got URL [58/100] 
[t6] Generating new URLs ... 
[t6] Generated 0 new URLs.
[t6] Crawled stats.wikimedia.org
