In [254]:
!pip install tinycss lxml cssselect

Collecting lxml
  Downloading lxml-4.6.3-cp38-cp38-manylinux2014_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 13.2 MB/s eta 0:00:01
[?25hCollecting cssselect
  Downloading cssselect-1.1.0-py2.py3-none-any.whl (16 kB)
Installing collected packages: lxml, cssselect
Successfully installed cssselect-1.1.0 lxml-4.6.3


In [42]:
from bs4 import BeautifulSoup, Tag
from typing import List
from collections import defaultdict

In [2]:
DATA_PTH = "../data/"

In [419]:
with open(DATA_PTH + "Raiffeisen Online.html") as f:
    html_content = f.read()

In [420]:
soup = BeautifulSoup(html_content, "html.parser")

In [421]:
header_tag_names = [f"h{header_type}" for header_type in range(1, 6 + 1)]
text_tags = header_tag_names + ["div"]

for header_type in range(1, 6 + 1):
    tags = soup.find_all(f"h{header_type}")
    if tags:
        print([tag.text for tag in tags])

['Войти в онлайн-банк', 'Мобильный банк на каждый день']
['Возможности', 'Возможности', 'Подключить мобильный банк']
['Все виды переводов', 'Управление продуктами', 'Оплата услуг', 'Онлайн сервисы', 'Все виды переводов', 'Управление продуктами', 'Оплата услуг', 'Онлайн сервисы']


In [443]:
soup.find_all("input")[0]["placeholder"]

'Логин'

In [390]:
style[style.find("url(") + 5: -3]

'https://cms-res.online.sberbank.ru/PRELOGINBANNERS/images/default/slide1.jpg'

In [279]:
for child in soup:
    if hasattr(child, "name"):
        print(child.name)

None
None
None
None
html


In [497]:
from uuid import uuid4

In [519]:
class Elem:
    def __eq__(self, other):
        return isinstance(self, other.__class__) or isinstance(other, self.__class__)

In [627]:
class TextElem(Elem):
    def __init__(self, tag: Tag):
        self.tag = tag
        
        self.text = tag.string
        self.text_eq = "".join([line.strip() for line in self.text.split()])
        
    def __eq__(self, other):
        if not super().__eq__(other):
            return False
#         print("eq of texts", self)
#         print("other", other)
#         print(self.text == other.text)
        return self.text_eq == other.text_eq
    
    def __repr__(self):
        return f"TextElem(text: {self.text})"

In [637]:
class ImgElem(Elem):
    def __init__(self, tag: Tag):
#         print("create img elem", tag)
        self.tag = tag
        if "class" in self.tag:
            self.tag_class = self.tag["class"]
        else:
            self.tag_class = None
        
        self.is_picture = tag.name in ["svg", "canvas"]
        self.url = None
        if not self.is_picture:
            self.url = tag["style"][tag["style"].find("url(") + 5: -3]
            
    def __repr__(self):
        return f"ImgElem(is_picture: {self.is_picture}, url: {self.url}, class: {self.tag_class})"
            
    def __eq__(self, other):
        if not super().__eq__(other):
            return False
        if self.is_picture:
            if "class" in self.tag and "class" in other.tag:
                return self.tag["class"] == other.tag["class"]
            else:
                return True
#                 # both don't have classes
#                 return ("class" not in self.tag) and ("class" not in other.tag)
                
        else:
            return self.url == other.url

In [638]:
class ButtonElem(Elem):
    def __init__(self, tag: Tag):
        self.tag = tag
        
        self.action = None
        if self.tag.name == "button":
            self.action = self.tag["class"]
        else:
            if self.tag.href is not None:
                self.action = self.tag.href
            else:
                self.action = self.tag.onclick
                
    def __eq__(self, other):
        if not super().__eq__(other):
            return False
        return self.action == other.action
    
    def __repr__(self):
        return f"ButtonElem(action: {self.action})"

In [639]:
class InputElem(Elem):
    def __init__(self, tag: Tag):
        self.tag = tag
        self.placeholder = self.tag["placeholder"]
        
    def __eq__(self, other):
        if not super().__eq__(other):
            return False
        return self.placeholder == other.placeholder
    
    def __repr__(self):
        return f"InputElem(placeholder: {self.placeholder})"

In [640]:
valuable_elements = []

def maybe_create_obj(tag: Tag):
    is_text = tag.name in header_tag_names + ["p", "span", "div"] and tag.string
    if is_text:
        return TextElem(tag)
    
    is_img = False
    try:
        is_img = "background-image" in tag["style"]
    except:
        pass
    is_img = is_img or tag.name in ["svg", "canvas"]
    
    if is_img:
        return ImgElem(tag)
        
    is_button = tag.name == "button"
    if not is_button:
        if hasattr(tag, "href"):
            is_button = is_button or tag.href is not None
        if hasattr(tag, "onclick"):
            is_button = is_button or tag.onclick is not None
    if is_button:
        return ButtonElem(tag)
    is_input = tag.name == "input"
    if is_input:
        return InputElem(tag)

def soup_dfs(tags: List[Tag]):
    for tag in tags:
        obj = maybe_create_obj(tag)
        if obj is not None:
            valuable_elements.append(obj)
                    
        if hasattr(tag, "children"):
            soup_dfs(list(tag.children))

In [641]:
soup_dfs(soup.children)

In [642]:
uniq_elems = []
for elem in valuable_elements:
    if elem not in uniq_elems:
        uniq_elems.append(elem)

In [643]:
for obj in valuable_elements:
#     try:
    print(obj)
#     except KeyError:
#         pass

TextElem(text: Обновите версию Вашего браузера)
TextElem(text: В 2021 году мы перестанем поддерживать устаревшие версии браузеров (например, Internet Explorer 9 и старше). Мы делаем это для безопасности и удобства клиентов.)
TextElem(text: Обновите, пожалуйста, Ваш браузер до последней версии или установите один из современных браузеров, чтобы продолжать пользоваться интернет-банком Райффайзен-Онлайн.)
TextElem(text: Браузер не поддерживает JavaScript)
TextElem(text: Москва)
TextElem(text: +7 (495) 775-52-03)
TextElem(text: для звонков из других регионов России)
TextElem(text: 8 (800) 700-00-72)
ImgElem(is_picture: True, url: None, class: None)
ImgElem(is_picture: True, url: None, class: None)
TextElem(text: RU)
ImgElem(is_picture: True, url: None, class: None)
ImgElem(is_picture: True, url: None, class: None)
TextElem(text: RU)
ImgElem(is_picture: True, url: None, class: None)
TextElem(text: EN)
ImgElem(is_picture: True, url: None, class: None)
ImgElem(is_picture: True, url: None, cla

In [636]:
tag["style"] = "color: red;"

TypeError: 'TextElem' object does not support item assignment