# Information retrieval homework 1 - Crawler
https://github.com/IUCVLab/information-retrieval/blob/main/homeworks/2023/2023S-01%20-%20Crawling.ipynb

## Task 1.1 Download and persist

In [12]:
import requests
from urllib.parse import quote
from hashlib import sha512
import os
from urllib.parse import urlparse

class Document:
    def __init__(self, url):
        self.url = url
        self.file_name_hashed = ""
        self.type = ".txt"
    # get document via download, persist, and load
    def get(self):
        if not self.load():
            if not self.download():
                raise FileNotFoundError(self.url)
            else:
                self.persist()
    # download self.url content, store it in self.content and return True in case of success
    def download(self):
        try:
            response = requests.get(self.url, allow_redirects=True)
            if response.status_code != 200:
                print(response.status_code, response.reason, 'for', self.url)
                return False
            self.content = response.content
            path = urlparse(self.url).path
            if path:
                self.type = os.path.splitext(path)[1]
            print(f"File downloaded from {self.url}")
            return True
        except:
            return False

    # write document content to hard drive, return True in case of success
    def persist(self):
        try:
            if self.file_name_hashed == "":
                self.file_name_hashed =  sha512(self.url.encode()).hexdigest() + self.type
            with open(self.file_name_hashed, 'wb') as f:
                f.write(self.content)
                print(f"File saved as {self.file_name_hashed}")
        except:
            return False

    # load content from hard drive, store it in self.content and return True in case of success
    def load(self):
        try:
            with open(self.file_name_hashed, 'rb') as f:
                self.content = f.read()
                return True
        except:
            return False

In [13]:
doc = Document('http://sprotasov.ru/data/iu.txt')

doc.get()
assert doc.content, "Document download failed"
assert "Code snippets, demos and labs for the course" in str(doc.content), "Document content error"

doc.get()
assert doc.load(), "Load should return true for saved document"
assert "Code snippets, demos and labs for the course" in str(doc.content), "Document load from disk error"
print("Task 1.1 test Success")

File downloaded from http://sprotasov.ru/data/iu.txt
File saved as 399b014f7ddfc1e4b2721c246d3752b1159261ef9780f58f263bed08e521b28ca74c58c35806bd1871e1cb57b879a9da2e77e57dd3a17255a3f258b42377a365.txt
Task 1.1 test Success


In [14]:
new_doc = Document('http://commondatastorage.googleapis.com/codeskulptor-demos/DDR_assets/Kangaroo_MusiQue_-_The_Neverwritten_Role_Playing_Game.mp3')
new_doc.get()
assert new_doc.load(), "Load should return true for saved document"
print("Task 1.1 mp3 test Success")

File downloaded from http://commondatastorage.googleapis.com/codeskulptor-demos/DDR_assets/Kangaroo_MusiQue_-_The_Neverwritten_Role_Playing_Game.mp3
File saved as 26db2dd587b44363c8f6ee0acfd28dbf706e3e7a71722d7543d241e3203a58e2f3781ef242293a4c54bbffa6d46e397937ddc61f6257055ed8432d4435d9dec1.mp3
Task 1.1 mp3 test Success


In [15]:
third_doc = Document('https://innopolis.university/files/politicacookies.pdf?lang=ru&id=12&site=s1&template=university24&landing_mode=edit')
third_doc.get()
assert new_doc.load(), "Load should return true for saved document"
print("Task 1.1 pdf test Success")

File downloaded from https://innopolis.university/files/politicacookies.pdf?lang=ru&id=12&site=s1&template=university24&landing_mode=edit
File saved as 183e243dcc6c242023d3c8ed65c6334d252d4d0fda2bb74220d4bddc536085610530596d7f7cf4957094a3629b5d048fdd845b5ecc7f4298878a779b40359711.pdf
Task 1.1 pdf test Success


## Task 1.2 Parse HTML

In [16]:
from bs4 import BeautifulSoup
from bs4.element import Comment
from bs4 import SoupStrainer
import urllib.parse
import httplib2

class HTMLDocument(Document):

    def __init__(self, url):
        super().__init__(url)
        self.anchors = []
        self.images = []
        self.text = ""
    # check if element is visible
    def tag_visible(self, element):
        if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
            return False
        if isinstance(element, Comment):
            return False
        return True
    # extract text from html
    def textFromHTML(self, body):
        soup = BeautifulSoup(body, 'html.parser')
        texts = soup.findAll(text=True)
        visible_texts = filter(self.tag_visible, texts)
        return u"".join(t.strip() for t in visible_texts)

    # extract plain text, images and links from the document
    def parse(self):
        try:
            http = httplib2.Http()
            status, response = http.request(self.url)
            self.text = self.textFromHTML(response)
            for link in BeautifulSoup(response, 'html.parser', parse_only=SoupStrainer('a') ):
                if(link.has_attr('href')):
                    self.anchors.append((link.text, link['href']))
            html = response.decode()
            soup = BeautifulSoup(html, 'html.parser')
            for img in soup.find_all('img'):
                src = img.get('src')
                if src:
                    src = urllib.parse.urljoin(self.url, src)
                    self.images.append(src)
            return True
        except:
            return False

In [17]:
doc = HTMLDocument('http://sprotasov.ru')
doc.get()
doc.parse()

assert "just few links" in doc.text, "Error parsing text"
assert any(p[1] == "https://twitter.com/07C3" for p in doc.anchors), "Error parsing links"
assert "http://sprotasov.ru/images/gb.svg" in doc.images, "Error parsing images"
print("Task 1.2 test Success")


File downloaded from http://sprotasov.ru
File saved as adb3427a17aef9dbfd24527adc85614193d6831828b7303941b457a289c9463bed5ffb2f5c5dd6100f6b467e21a810908839ee5b694502cff4d7d6956b407f9c.txt
Task 1.2 test Success


## Task 1.3 Document analysis

In [32]:
from collections import Counter
import re
class HtmlDocumentTextData:
    def __init__(self, url):
        self.doc = HTMLDocument(url)
        self.doc.get()
        self.doc.parse()

    # sentences parser
    def get_sentences(self):
        result = []
        soup = BeautifulSoup(self.doc.content, 'html.parser')
        tag = soup.body
        try:
            for s in tag.strings:
                result.append(s.strip().lower())
        except:
            print("ERROR IN PARSING {doc.url}")
        return result

    # return Counter object of the document, containing mapping {`word` -> count_in_doc}
    def get_word_stats(self):
        sentences = self.get_sentences()
        words = []
        for s in sentences:
            words += re.split(' ', s)
        for i in range( len(words) ):
            words[i] = re.sub(r'[^\w\s]', '', words[i].lower())
        words = filter(lambda x: x != '', words)
        return Counter(words)

In [28]:
doc = HtmlDocumentTextData("https://innopolis.university/")

print(doc.get_word_stats().most_common(10))
assert [x for x in doc.get_word_stats().most_common(10) if x[0] == 'иннополис'], 'иннополис should be among most common'
print("Task 1.3 test Success")

File downloaded from https://innopolis.university/
File saved as 09f4ea693dd9b25ed0ccc27a46a34a00d3faec660a4b0143f63a08a631ffbe17a817de31a1448b57a318c1aa6dd71c80633263149dd438ce4a663190574121bb
[('и', 44), ('в', 22), ('иннополис', 20), ('с', 13), ('на', 12), ('университет', 11), ('университета', 11), ('центр', 10), ('для', 9), ('образование', 8)]
Task 1.3 test Success


## Task 1.4 Crawler

In [29]:
from queue import Queue

class Crawler:
    # return generator of HtmlDocumentTextData objects as per depth specified
    def crawl_generator(self, source, depth=1):
        visited = {} # visited urls
        q = Queue() # queue of urls to process
        q.put( (0, source) )
        while(not q.empty()):
            try:
                current_website = q.get()
                if current_website[0] > depth:
                    break
                if current_website[1] in visited:
                    continue
                visited[current_website[1]] = True
                if current_website[1][-4:] in ('.pdf', '.mp3', '.avi', '.mp4', '.txt'):
                    continue
                print(f"Working on {current_website}")
                doc_data = HtmlDocumentTextData(current_website[1])
                for childDoc in doc_data.doc.anchors:
                    q.put((current_website[0] + 1, childDoc[1]))
                print("Finished working on ", current_website)
                yield doc_data
            except Exception as e:
                continue

In [30]:
crawler = Crawler()
counter = Counter()

for c in crawler.crawl_generator("https://innopolis.university/en/", 2):
    print(c.doc.url)
    if c.doc.url[-4:] in ('.pdf', '.mp3', '.avi', '.mp4', '.txt'):
        print("Skipping", c.doc.url)
        continue
    counter.update(c.get_word_stats())
    print(len(counter), "distinct word(s) so far")

print("Done")
print(len(counter), "distinct words in total")
print(counter.most_common(20))
assert [x for x in counter.most_common(20) if x[0] == 'иннополис'], 'иннополис sould be among most common'
assert [x for x in counter.most_common(20) if x[0] == 'innopolis'], 'innopolis sould be among most common'

Done
30308 distinct words in total
[('the', 9727), ('and', 8694), ('и', 8345), ('of', 7961), ('в', 5773), ('in', 5141), ('to', 4794), ('for', 3131), ('university', 3114), ('на', 3064), ('a', 2882), ('с', 2482), ('иннополис', 2404), ('по', 2126), ('at', 1718), ('я', 1693), ('университета', 1684), ('is', 1649), ('i', 1614), ('innopolis', 1549)]


In [31]:
print("Task 1.4 test Success")

Task 1.4 test Success
