# NB: This document contains master-level tasks

## 1. [M][15] Account the caching policy

Sometimes remote documents (especially when we speak about static content like `js` or `gif`) can swear that they will not change for some time. This is done by setting [Cache-Control response header](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Cache-Control).

In [1]:
import requests
requests.get('https://polyfill.io/v3/polyfill.min.js').headers['Cache-Control']

'public, s-maxage=31536000, max-age=604800, stale-while-revalidate=604800, stale-if-error=604800'

In [2]:
# paste here your Document class implementation from "01 - Crawling" file
import requests
from urllib.parse import quote

class Document:

    def __init__(self, url):
        self.url = url
        self.download_time = 0

    def get(self):
        if not self.load():
            if not self.download():
                raise FileNotFoundError(self.url)
            else:
                self.persist()

    def download(self):
        #TODO download self.url content, store it in self.content and return True in case of success
        response = requests.get(self.url)
        if response.ok:
            self.content = response.content
            self.download_time = time.time()
            return True
        else:
            return False

    def persist(self):
        #TODO write document content to hard drive
        try:
            file = "from_" + self.url.split("/")[2].split(".")[0] + "_got_file_" + self.url.rsplit("/", 1)[1]
            with open(file, "wb") as f:
                f.write(self.content)
            return True
        except:
            return False

    def load(self):
        #TODO load content from hard drive, store it in self.content and return True in case of success
        try:
            file = "from_" + self.url.split("/")[2].split(".")[0] + "_got_file_" + self.url.rsplit("/", 1)[1]
            with open(file, "rb") as f:
                self.content = f.read()
            return True
        except:
            return False

Please study the documentation and implement a descendant to a `Document` class, which will refresh the document in case of expired cache even if the file is already on the hard drive.

In [3]:
import logging
import os
import time
import re
import datetime

class CachedDocument(Document):
    # TODO your code here
    def __init__(self, url):
        Document.__init__(self, url)

    def get(self):
        response = requests.get(self.url)
        # print(response.headers)
        file = "from_" + self.url.split("/")[2].split(".")[0] + "_got_file_" + self.url.rsplit("/", 1)[1]

        if "cache-control" in response.headers and re.split(", |=", response.headers["cache-control"])[0] == "public":
            max_age = int(re.split(", |=", response.headers["cache-control"])[4])
        else:
            # max_age = int(re.split("; |= ", response.headers["Strict-Transport-Security"])[1])
            max_age = int(re.split(", |: ", response.headers["NEL"])[3])

        # c_time = datetime.datetime.fromtimestamp(os.path.getctime(file)).time()
        # cached_time = float(datetime.timedelta(hours=c_time.hour, minutes=c_time.minute, seconds=c_time.second).total_seconds())
        cached_time = self.download_time
        age = round(time.time() - cached_time)

        if age > max_age:
            print("Refreshing...")
            logging.info("Refreshing...")
            if not self.download():
                raise FileNotFoundError(self.url)
            else:
                self.persist()
        else:
            if not self.load():
                if not self.download():
                    raise FileNotFoundError(self.url)
                else:
                    self.persist()

### Tests

Add logging to your code and show that your code behaves differently for documents with different caching policy.

In [4]:
import time

doc = CachedDocument('https://polyfill.io/v3/polyfill.min.js')
doc.get()
time.sleep(2)
doc.get()
time.sleep(2)
doc.get()

doc = CachedDocument('https://yandex.ru/')
doc.get()
time.sleep(2)
doc.get()
time.sleep(2)
doc.get()

Refreshing...
Refreshing...


## 2. [M][35] Languages
Maybe you heard, that there are multiple languages in the world. European languages, like Russian and English, use similar puctuation, but even in this family there is ¡Spanish!

Other languages can use different punctiation rules, like **Arabic or [Thai](http://www.thai-language.com/ref/breaking-words)**.

Your task is to support (at least) three languages (English, Arabic, and Thai) tokenization in your `HtmlDocumentTextData` class descendant.

What should you do (acceptance criteria):
1. Use any language dection techniques, e.g. [langdetect](https://pypi.org/project/langdetect/).
2. Use language-specific tokenization tools, e.g. for [Thai](https://pythainlp.github.io/tutorials/notebooks/pythainlp_get_started.html#Tokenization-and-Segmentation) and [Arabic](https://github.com/CAMeL-Lab/camel_tools).
3. Use these pages to test your code: [1](https://www.bangkokair.com/tha/baggage-allowance) and [2](https://alfajr-news.net/details/%D9%85%D8%B4%D8%B1%D9%88%D8%B9-%D8%AF%D9%8A%D9%85%D9%88%D9%82%D8%B1%D8%A7%D8%B7%D9%8A-%D9%81%D9%8A-%D8%A7%D9%84%D9%83%D9%88%D9%86%D8%BA%D8%B1%D8%B3-%D8%A7%D9%84%D8%A3%D9%85%D8%B1%D9%8A%D9%83%D9%8A-%D9%84%D9%85%D8%B9%D8%A7%D9%82%D8%A8%D8%A9-%D8%A8%D9%88%D8%AA%D9%8A%D9%86).
4. Pass the tests.

In [None]:
!pip install langdetect
!pip install pythainlp
!pip install camel-tools

In [5]:
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.parse


class HtmlDocument(Document):

    def parse(self):
        #TODO extract plain text, images and links from the document
        # self.anchors = [("fake link text", "http://fake.url/")]
        # self.images = ["http://image.com/fake.jpg"]
        # self.text = "fake text and some other text"

        self.anchors = []
        self.images = []
        self.text = " "

        page = requests.get(self.url)
        soup = BeautifulSoup(page.content, "html.parser")

        for a in soup.find_all("a", href=True):
            url = urllib.parse.urljoin(self.url, a.get("href"))
            text = a.text
            self.anchors.append((text, url))

        for img in soup.find_all("img", src=True):
            src = urllib.parse.urljoin(self.url, img.get("src"))
            self.images.append(src)

        self.text = soup.text
        # tag = soup.body
        # for string in tag.strings:
        #     self.text = self.text + string.strip()



In [6]:
# paste here your HtmlDocumentTextData class implementation from "01 - Crawling" file
from collections import Counter
from string import punctuation
from string import digits

class HtmlDocumentTextData:

    def __init__(self, url):
        self.url = url
        self.doc = HtmlDocument(url)
        self.doc.get()
        self.doc.parse()

    def get_sentences(self):
        #TODO implement sentence parser
        page = requests.get(self.url)
        soup = BeautifulSoup(page.content, "html.parser")

        result = " "
        tag = soup.body
        for string in tag.strings:
            if string.strip() == '':
                continue
            result += string.strip().translate({ord(k): None for k in digits})
        return result

    # def get_sentences(self):
    #     #TODO implement sentence parser
    #     result = []
    #
    #     page = requests.get(self.url)
    #     soup = BeautifulSoup(page.content, "html.parser")
    #     result.append(soup.text.replace('\n', ' ').replace('\r', '').replace('\t', ' '))
    #     return result

    def get_word_stats(self):
        #TODO return Counter object of the document, containing mapping {`word` -> count_in_doc}
        sentences = self.get_sentences()

        counter = Counter((x.rstrip(punctuation).lower() for y in sentences for x in y.split()))

        return counter

In [7]:
from langdetect import detect
import pythainlp
# import camel-tools

class MultilingualHtmlDocumentTextData(HtmlDocumentTextData):
    def __init__(self, url):
        HtmlDocumentTextData.__init__(self, url)

    def get_word_stats(self):
        sentences = self.get_sentences()
        # print(sentences)

        langs = ["en", "th", "ar"]

        lang = detect(sentences)
        print(lang)

        words = pythainlp.word_tokenize(sentences, keep_whitespace=False)
        counter = Counter((x.rstrip(punctuation).lower() for y in words for x in y.split()))
        return counter


### Tests

In [8]:
doc = MultilingualHtmlDocumentTextData("https://www.bangkokair.com/tha/baggage-allowance")
print(doc.get_word_stats().most_common(10))

doc = MultilingualHtmlDocumentTextData("https://alfajr-news.net/details/%D9%85%D8%B4%D8%B1%D9%88%D8%B9-%D8%AF%D9%8A%D9%85%D9%88%D9%82%D8%B1%D8%A7%D8%B7%D9%8A-%D9%81%D9%8A-%D8%A7%D9%84%D9%83%D9%88%D9%86%D8%BA%D8%B1%D8%B3-%D8%A7%D9%84%D8%A3%D9%85%D8%B1%D9%8A%D9%83%D9%8A-%D9%84%D9%85%D8%B9%D8%A7%D9%82")
print(doc.get_word_stats().most_common(10))

th
[('', 60), ('สัมภาระ', 33), ('การ', 21), ('กิโลกรัม', 21), ('ของ', 20), ('และ', 19), ('ที่', 19), ('เรา', 18), ('เที่ยวบิน', 16), ('เดินทาง', 16)]
ar
[('', 11), ('في', 3), ('وفاة', 3), ('الشيخة', 3), ('مريم', 3), ('و', 3), ('حميد', 2), ('بن', 2), ('واجب', 2), ('العزاء', 2)]
