In [254]:
!pip install tinycss lxml cssselect

Collecting lxml
  Downloading lxml-4.6.3-cp38-cp38-manylinux2014_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 13.2 MB/s eta 0:00:01
[?25hCollecting cssselect
  Downloading cssselect-1.1.0-py2.py3-none-any.whl (16 kB)
Installing collected packages: lxml, cssselect
Successfully installed cssselect-1.1.0 lxml-4.6.3


In [42]:
from bs4 import BeautifulSoup, Tag
from typing import List
from collections import defaultdict

In [2]:
DATA_PTH = "../data/"

In [419]:
with open(DATA_PTH + "Raiffeisen Online.html") as f:
    html_content = f.read()

In [420]:
soup = BeautifulSoup(html_content, "html.parser")

In [790]:
header_tag_names = [f"h{header_type}" for header_type in range(1, 6 + 1)]
text_tags = header_tag_names + ["p", "span", "div"]

In [421]:
# for header_type in range(1, 6 + 1):
#     tags = soup.find_all(f"h{header_type}")
#     if tags:
#         print([tag.text for tag in tags])

['Войти в онлайн-банк', 'Мобильный банк на каждый день']
['Возможности', 'Возможности', 'Подключить мобильный банк']
['Все виды переводов', 'Управление продуктами', 'Оплата услуг', 'Онлайн сервисы', 'Все виды переводов', 'Управление продуктами', 'Оплата услуг', 'Онлайн сервисы']


In [688]:
[elem["href"] for elem in soup.find_all("a")]

['https://online.raiffeisen.ru/import/old-browsers/index.html',
 'tel:+74957755203',
 'tel:88007000072',
 'https://online.raiffeisen.ru/login/main',
 'https://www.raiffeisen.ru/',
 'https://online.raiffeisen.ru/login/main',
 'https://online.raiffeisen.ru/login/tariff',
 'https://online.raiffeisen.ru/login/qa',
 'https://online.raiffeisen.ru/demo/',
 'https://online.raiffeisen.ru/login/main',
 'https://www.raiffeisen.ru/retail/cards/',
 'https://online.raiffeisen.ru/login/tariff',
 'https://online.raiffeisen.ru/login/qa',
 'https://online.raiffeisen.ru/demo/',
 'https://online.raiffeisen.ru/login/main',
 'tel:+74957755203',
 'tel:88007000072',
 'https://online.raiffeisen.ru/login/connect',
 'https://online.raiffeisen.ru/login/restore',
 'https://www.raiffeisen.ru/retail/remote_service/connect/?active_tab=tab-4',
 'https://apps.apple.com/ru/app/%D1%80%D0%B0%D0%B9%D1%84%D1%84%D0%B0%D0%B9%D0%B7%D0%B5%D0%BD-%D0%BE%D0%BD%D0%BB%D0%B0%D0%B9%D0%BD-%D0%B1%D0%B0%D0%BD%D0%BA-%D1%80%D0%BE%D1%81%D1%

In [916]:
class Elem: 
    def __eq__(self, other):
        return isinstance(self, other.__class__) or isinstance(other, self.__class__)

In [917]:
class TextElem(Elem):
    def __init__(self, tag: Tag):
        self.category = "text"
        self.tag = tag
        
        self.text = tag.string
        self.text_eq = "".join([line.strip() for line in self.text.split()])
        
    def __eq__(self, other):
        if not super().__eq__(other):
            return False
#         print("eq of texts", self)
#         print("other", other)
#         print(self.text == other.text)
        return self.text_eq == other.text_eq
    
    def __repr__(self):
        return f"TextElem(text: {self.text})"

In [918]:
class ImgElem(Elem):
    def __init__(self, tag: Tag):
        self.category = "image"
#         print("create img elem", tag)
        self.tag = tag
        if self.tag.has_attr("class"):
            self.tag_class = self.tag["class"]
        else:
            self.tag_class = None
        
        self.is_picture = tag.name in ["svg", "canvas"]
        self.url = None
        if not self.is_picture:
            self.url = tag["style"][tag["style"].find("url(") + 5: -3]
            
    def __repr__(self):
        return f"ImgElem(is_picture: {self.is_picture}, url: {self.url}, class: {self.tag_class})"
            
    def __eq__(self, other):
        if not super().__eq__(other):
            return False
        if self.is_picture:
            if self.tag.has_attr("class") and other.tag.has_attr("class"):
                return self.tag["class"] == other.tag["class"]
            else:
                return True
#                 # both don't have classes
#                 return ("class" not in self.tag) and ("class" not in other.tag)
                
        else:
            return self.url == other.url

In [919]:
class ButtonElem(Elem):
    def __init__(self, tag: Tag):
        self.category = "button"
        self.tag = tag
        
        self.action = None
        
        if self.tag.has_attr("href"):
            self.href = self.tag["href"]
        else:
            self.href = None
            
        if self.tag.name == "button":
            self.action = self.tag.text.strip()
        else:
#             print("beb", tag)
            if self.tag.has_attr("onclick"):
                self.action = self.tag["onclick"]
            else:
                self.action = self.tag["href"]
                
    def __eq__(self, other):
        if not super().__eq__(other):
            return False
#         print("Compare", self.action, other.action)
        return self.action == other.action
    
    def __repr__(self):
        return f"ButtonElem(action: {self.action})"

In [920]:
class InputElem(Elem):
    def __init__(self, tag: Tag):
        self.category = "input"
        self.tag = tag
        self.placeholder = self.tag["placeholder"] if self.tag.has_attr("placeholder") else None
        
    def __eq__(self, other):
        if not super().__eq__(other):
            return False
        return self.placeholder == other.placeholder or (self.placeholder is None and other.placeholder is None)
    
    def __repr__(self):
        return f"InputElem(placeholder: {self.placeholder})"

In [921]:
def maybe_create_obj(tag: Tag):
    
    if not isinstance(tag, Tag):
        return
    is_text = tag.name in text_tags and tag.string
    if is_text:
        return TextElem(tag)
    
    is_img = False
    try:
        is_img = "background-image" in tag["style"]
    except:
        pass
    is_img = is_img or tag.name in ["svg", "canvas"]
    
    if is_img:
        return ImgElem(tag)
        
    is_button = tag.name == "button"
    if not is_button:
        if tag.has_attr("href") and tag.name not in ["link"]:
            is_button = is_button or tag["href"] is not None
        if tag.has_attr("onclick"):
            is_button = is_button or tag["onclick"] is not None
    if is_button:
#         print("button", tag)
        return ButtonElem(tag)
    
    is_input = tag.name == "input"
    if is_input:
        return InputElem(tag)

def soup_dfs(tags: List[Tag], valuable_elements):
    for tag in tags:
        obj = maybe_create_obj(tag)
        if obj is not None:
            valuable_elements.append(obj)
                    
        if hasattr(tag, "children"):
            soup_dfs(list(tag.children), valuable_elements)

In [922]:
text_tags

['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'div']

In [923]:
# for obj in uniq_elems:
# #     try:
#     print(obj)
# #     except KeyError:
# #         pass

In [924]:
def get_elems_repr(pth):
    with open(pth) as f:
        html_content = f.read()
    soup = BeautifulSoup(html_content, "html.parser")
    
    valuable_elements = []
    soup_dfs(soup.children, valuable_elements)
    
    uniq_elems = []
    for elem in valuable_elements:
        if elem not in uniq_elems:
            uniq_elems.append(elem)
            
    return uniq_elems


In [1004]:
reprs = {}

name2path = {"alpha": DATA_PTH + "Интернет-банк «Альфа-Клик».html",
             "sber": DATA_PTH + "Вход - СберБанк Онлайн.html",
             "vk": DATA_PTH + "Sign in _ VK.html",
             "raif": DATA_PTH + "Raiffeisen Online.html",
             "mts": DATA_PTH + "Введите номер телефона МТС.html"
            }

for name, path in name2path.items():
    reprs[name] = get_elems_repr(path)

In [1005]:
mob_name2path = {"alpha_mob": DATA_PTH + "alpha_mobile.html",
                  "sber_mob": DATA_PTH + "sber_mobile.html",
                  "vk_mob": DATA_PTH + "vk_mobile.html",
                  "raif_mob": DATA_PTH + "raif_mobile.html",
                  "mts_mob": DATA_PTH + "mts_mobile.html"
                 }

for name, path in mob_name2path.items():
    reprs[name] = get_elems_repr(path)

## Metrics of similarity

In [1006]:
def iou(lst1, lst2):
    intersected = 0
    
    for elem in lst1:
        if elem in lst2:
            intersected += 1
            
    return intersected / (len(lst1) + len(lst2) - intersected)

In [1007]:
iou(reprs["sber"], reprs["sber_mob"])

0.9491525423728814

In [1008]:
def get_hrefs(elems: List[Elem]):
    hrefs = []
    for elem in elems:
        if hasattr(elem, "href"):
            if elem.href is not None:
                hrefs.append(elem.href)
    return hrefs

In [1009]:
def hrefs_iou(repr1: List[Elem], repr2: List[Elem]):
    return iou(get_hrefs(repr1), get_hrefs(repr2))

In [1017]:
def intersection_inversions_index(repr1: List[Elem], repr2: List[Elem]):
    inter_idxs1, inter_idxs2 = [], []
    
    for idx1, elem in enumerate(repr1):
        if elem in repr2:
            inter_idxs1.append(idx1)
            inter_idxs2.append(repr2.index(elem))
            
#     print(inter_idxs1, inter_idxs2)
    # cnt inversions
    nb_invs = 0
    nb_pairs = 0
    for idx, position in enumerate(inter_idxs1):
        for position2 in inter_idxs2[idx + 1:]:
            if position2 < position:
                nb_invs += 1
            nb_pairs += 1
    
    if nb_pairs == 0:
        return 1 # biggest value because have no any order between intersecting orders
                
    return nb_invs / nb_pairs
        

In [1018]:
from collections import defaultdict

def component_types_mismatch(repr1, repr2):
    categ2cnt = defaultdict(int)
    
    for elem in repr1:
        categ2cnt[elem.category] += 1
    for elem in repr2:
        categ2cnt[elem.category] -= 1
        
    return sum([abs(val) for val in categ2cnt.values()]) / (len(repr1) + len(repr2))

In [1019]:
from collections import Counter

def words_similarity(repr1, repr2):
    texts1 = []
    for elem in repr1:
        if elem.category == "text":
            texts1.append(elem.text)
    texts2 = []
    for elem in repr2:
        if elem.category == "text":
            texts2.append(elem.text)
    
    cnter1 = Counter(" ".join(texts1).split())
    cnter2 = Counter(" ".join(texts2).split())
    sum1 = sum(cnter1.values())
    sum2 = sum(cnter2.values())
    
    # normalization
    for key, value in cnter1.items():
        cnter1[key] = value / sum1
    for key, value in cnter2.items():
        cnter2[key] = value / sum2
    
#     print(cnter1, "\n", cnter2)
    
    return sum((cnter1 & cnter2).values()) / sum((cnter1 | cnter2).values())

In [1013]:
hrefs_iou(reprs["raif"], reprs["raif_mob"])

0.95

In [1014]:
intersection_inversions_index(reprs["sber"], reprs["raif"])

1

In [1015]:
component_types_mismatch(reprs["alpha"], reprs["vk"])

0.36

In [1016]:
words_similarity(reprs["raif"], reprs["raif_mob"])

0.9809531687847001

## Distinguish websites (fit linear regression)

In [999]:
# calc metrics and labels for pairs
# fit linear regression, save weights, use in script

In [1023]:
samples = []

for name1, repr1 in reprs.items():
    for name2, repr2 in reprs.items():
        if name1 == name2:
            continue
        x = [
            iou(repr1, repr2),
            hrefs_iou(repr1, repr2),
            intersection_inversions_index(repr1, repr2),
            component_types_mismatch(repr1, repr2),
            words_similarity(repr1, repr2)
        ]
        y = name1 == name2 + "_mob" or name2 == name1 + "_mob"
        samples.append((x, y))

In [1025]:
x, y = zip(*samples)

In [1029]:
from sklearn.linear_model import LinearRegression

In [1036]:
linreg = LinearRegression().fit(x, y)

In [1039]:
linreg.coef_, linreg.intercept_

(array([ 1.23886136, -0.32735346,  0.00480269, -0.26022503,  0.22453727]),
 0.10448863512832088)

In [1045]:
import json
weights = dict(zip(("iou", "hrefs_iou", "inversions_index", "comp_mismatch", "words"), linreg.coef_))

weights["bias"] = linreg.intercept_

with open("../fitted_lr_weights.json", "w") as f:
    json.dump(weights, f)

## Generating new html files

In [1073]:
with open(DATA_PTH + "Вход - СберБанк Онлайн.html") as f:
    html_content = f.read()
soup = BeautifulSoup(html_content, "html.parser")

valuable_elements = []
soup_dfs(soup.children, valuable_elements)

uniq_elems = []
for elem in valuable_elements:
    if elem not in uniq_elems:
        uniq_elems.append(elem)


In [1074]:
parent = uniq_elems[0].tag.parent

In [1075]:
parent.append(html_rep[0].tag)

In [1076]:
with open("../data/changed_html.html", "w") as f:
    f.write(str(soup))