In [496]:
import re
import json
import html_to_json

from time import time
from markdown import markdown as markdown_to_html
from markdownify import markdownify as html_to_markdown

In [204]:
# HTML Regex
# Tag-name alternations. Each one is used for both the opening and the closing
# half of its pattern, so it is spelled out once and spliced in twice.
_primary_multiline_tags = '|'.join(('head', 'header', 'form', 'footer'))
_secondary_multiline_tags = '|'.join((
    'script', 'noscript', 'applet', 'embed', 'object', 'param', 'style',
    'aside', 'iframe', 'table', 'form', 'nav', 'video', 'map', 'area',
    'canvas', 'svg', 'frame', 'frameset', 'noframes', 'textarea', 'button',
    'select', 'label', 'fieldset', 'datalist', 'output',
))
_primary_single_line_tags = '|'.join((
    'meta', 'base', 'basefont', 'link', 'audio', 'source', 'track',
    'iframe', 'input', 'br', 'hr',
))
_empty_wrapper_tags = '|'.join(('div', 'p', 'span', 'i', 'em', 'strong', 'button'))
_spaced_wrapper_tags = '|'.join(('div', 'span'))
# Attribute names matched literally, followed by the prefixes matched as "<prefix>-<anything>".
_attribute_names = '|'.join(
    ('id', 'class', 'style', 'target', 'rel')
    + tuple(prefix + r'-(.*?)' for prefix in ('aria', 'data', 'cz', 'dtr'))
)

# <!-- ... --> comments, possibly spanning several lines.
regex_comment = r'<!--([\s\S]*?)-->'
# <script ...> ... </script> blocks.
regex_rendering_area = r'<script\s?(.*?)>([\s\S]*?)<\/script>'
# Paired elements matched together with everything between the tags.
regex_primary_multiline = (
    r'<(' + _primary_multiline_tags + r')\s?(.*?)>([\s\S]*?)<\/(' + _primary_multiline_tags + r')>'
)
regex_secondary_multiline = (
    r'<(' + _secondary_multiline_tags + r')\s?(.*?)>([\s\S]*?)<\/(' + _secondary_multiline_tags + r')>'
)
# A single opening (or self-closing) tag.
regex_primary_single_line = r'<(' + _primary_single_line_tags + r')\s?(.*?)\/?>'
# name="value" attribute pairs for the listed names and prefixes.
regex_attributes_from_element = r'(' + _attribute_names + r')="(.*?)"'
# An attribute-less wrapper with nothing between its tags, e.g. <div></div>.
regex_empties_wrapper_element = (
    r'(<(' + _empty_wrapper_tags + r')><\/(' + _empty_wrapper_tags + r')>)'
)
# <div>/<span> elements; the groups capture tag, spacing, attributes, body and closing tag.
regex_add_spaces_on_wrapper_element = (
    r'<(' + _spaced_wrapper_tags + r')(\s?)(.*?)>([\s\S]*?)<\/(' + _spaced_wrapper_tags + r')>'
)
# The literal text javascript:void(0).
regex_javascript_void_on_anchor_element = r'javascript\:void\(0\)'

In [205]:
# Markdown Regex
# Each pattern is written as adjacent raw-string fragments (joined by the
# parser into one literal) so every piece can carry its own note.

regex_unentered_heading = (
    r'(\w|\W)'    # the single character sitting directly before the hashes
    r'(#{2,6})'   # a run of two to six '#'
)

regex_unidentified_line = (
    r'(=|-|\s\*)'  # '=', '-' or whitespace followed by '*'
    r'{4,}'        # ...repeated four or more times
)

regex_unidentified_anchor = (
    r'!?'         # optional image marker
    r'\[\]'       # empty label
    r'\(.*?\)'    # parenthesised target
)

regex_mixed_images_and_content = (
    r'\s+'        # whitespace separating the image from what precedes it
    r'(\[?!\[)'   # start of an image, optionally wrapped in a link: '![' or '[!['
)

# Disabled pattern, kept for reference:
# regex_mixed_anchor_and_content = r'\)\s(.*?)\s?\['

regex_mixed_lists_with_content = (
    r'\s+'        # whitespace before the bullet
    r'(\+|\*)'    # the bullet character
    r'\s'         # one whitespace character after the bullet
    r'(\[)'       # opening bracket of the list item's link
)

In [206]:
# Post-processing Regex
# Markdown image: group 1 is the alt text, group 2 the parenthesised target.
regex_image = r'!\[(.*?)\]\((.*?)\)'
# Markdown link: group 1 is the label, group 2 the target; groups 3/4 only
# match a trailing title written with backslash-escaped quotes (\"...\").
regex_anchor = r'\[(.*?)\]\((.*?)(\\\"(.*?)\\\")?\)'
# Currency amount: symbol, leading digits, then one or two separator+digits
# groups. The separator class is exactly ',' or '.'; the former `[,|\.]` also
# accepted a literal '|' because '|' is not alternation inside a character class.
regex_currency = r'(Rp|\$)\s?(\d+)([,.]\d{2,})([,.]\d{2,})?'
# Percentage such as "25%". The pattern has no capture groups.
regex_discount = r'\d+%'
# Leftover markers: "#<digits>", "[*" and "*" with optional leading whitespace.
regex_unformated = r'#\d+|\[\*|\s?\*'
# Fragments blanked out of product descriptions: "#<digits>", link tails
# "](...)", "*", "[", stray quote/parenthesis pairs, and the literal
# "mmmmmmmmmmlli" (presumably a site-specific artefact; origin not visible here).
regex_uncondition = r'(#\d+\s?)|\]\((.*?)\)|\s?\*|\[|mmmmmmmmmmlli|\"\)|\(\"'

In [475]:
def extract_products(source: str = ''):
    """Extract product entries from a raw HTML document.

    The HTML is stripped of non-content markup, converted to Markdown and
    re-split so candidate entries sit on their own line. A line is treated as
    a product when it contains a currency amount and an image marker (``![``)
    and is shorter than 750 characters.

    Args:
        source: Raw HTML text.

    Returns:
        A list of dicts. Every dict has ``title``, ``prices``, ``discounts``
        and ``description``; ``image`` is added when a Markdown image is found
        on the line and ``url`` when a Markdown link is found.
    """
    # Remove unused HTML structures. Order matters: every pass runs on the
    # output of the previous one.
    html_passes = (
        (regex_comment, ''),
        (regex_rendering_area, ''),
        (regex_primary_multiline, ''),
        (regex_secondary_multiline, ''),
        (regex_primary_single_line, ''),
        (regex_attributes_from_element, ''),
        (regex_empties_wrapper_element, ''),
        # Raw string: "\g" is not a valid escape in a normal string literal.
        (regex_add_spaces_on_wrapper_element, r'<\g<1>\g<2>\g<3>> \g<4> </\g<5>>'),
        (regex_javascript_void_on_anchor_element, '#'),
    )
    source_cleaned = source
    for pattern, replacement in html_passes:
        source_cleaned = re.sub(pattern, replacement, source_cleaned)

    # Remove unused Markdown structures. The first pass flattens the document
    # to one line; later passes re-insert breaks (the "\n" in a replacement
    # template is expanded to a newline by re.sub).
    markdown_passes = (
        (r'\s?\n\n+\s?|\s?\n\s?', ' '),
        (regex_unentered_heading, r'\g<1>\n\n\g<2>'),
        (regex_unidentified_line, ''),
        (regex_unidentified_anchor, ''),
        (regex_mixed_images_and_content, r'\n\n\g<1>'),
        (regex_mixed_lists_with_content, r'\n\n\g<2>'),
    )
    source_marked = html_to_markdown(source_cleaned)
    for pattern, replacement in markdown_passes:
        source_marked = re.sub(pattern, replacement, source_marked)

    products = []

    for line in source_marked.split('\n'):
        line = re.sub(r'\s\s+', ' ', line.strip())

        is_product = re.search(regex_currency, line) and '![' in line and len(line) < 750
        if not is_product:
            continue

        bracket = {
            'title': 'N/A',
            'prices': [],
            'discounts': [],
        }

        image = re.search(regex_image, line)
        if image:
            bracket['title'] = image.group(1)
            bracket['image'] = {
                'title': image.group(1),
                'url': image.group(2),
            }
            line = re.sub(regex_image, '', line).strip()

        for match_index, match in enumerate(re.finditer(regex_anchor, line), start=1):
            if match_index > 1:
                # Later links may carry `url "title"` inside the parentheses.
                title_url = re.search(r'(.*?)\"(.*?)\"', match.group(2))

                if title_url:
                    bracket['description'] = title_url.group(2).strip()
                    bracket['url'] = title_url.group(1).strip()
                else:
                    bracket['url'] = match.group(2).strip()

                bracket['description'] = bracket['description'] + ' ' + match.group(1).strip()
            else:
                bracket['description'] = match.group(1).strip()
                bracket['url'] = match.group(2).strip()

        line = re.sub(regex_anchor, '', line).strip()

        # A product line without any link has no description yet; start from
        # an empty one instead of raising KeyError.
        description = bracket.get('description', '')
        if line != '':
            description = description + ' ' + line

        bracket['description'] = re.sub(
            r'\s\s+', ' ', re.sub(regex_uncondition, ' ', description).strip()
        )

        for match in re.finditer(regex_currency, bracket['description']):
            # Glue the matched groups together, dropping the whitespace
            # between symbol and digits ("Rp 6.500" -> "Rp6.500").
            price = ''.join(unit for unit in match.groups() if unit)
            if price:
                bracket['prices'].append(price)

        # regex_discount has no capture groups, so the whole match is the
        # value, and it belongs in 'discounts' rather than 'prices'.
        for match in re.finditer(regex_discount, bracket['description']):
            bracket['discounts'].append(match.group(0))

        products.append(bracket)

    return ( products )

In [504]:
def extract_contents(source: str = ''):
    """Extract the link list and the running text from a raw HTML document.

    The HTML is stripped of non-content markup, converted to Markdown and
    filtered with a series of regex passes, then read line by line.

    Args:
        source: Raw HTML text.

    Returns:
        A ``(links, content)`` tuple. ``links`` is a list of
        ``{'title', 'url'}`` dicts, one per line whose first Markdown link has
        a non-empty label. ``content`` is the remaining lines joined with
        spaces, skipping lines that contain ``##``, a currency amount or a
        percentage.
    """
    # Remove unused HTML structures. Order matters: every pass runs on the
    # output of the previous one.
    html_passes = (
        (regex_comment, ''),
        (regex_rendering_area, ''),
        (regex_primary_multiline, ''),
        (regex_secondary_multiline, ''),
        (regex_primary_single_line, ''),
        (regex_attributes_from_element, ''),
        (regex_empties_wrapper_element, ''),
        # Raw string: "\g" is not a valid escape in a normal string literal.
        (regex_add_spaces_on_wrapper_element, r'<\g<1>\g<2>\g<3>> \g<4> </\g<5>>'),
        (regex_javascript_void_on_anchor_element, '#'),
    )
    source_cleaned = source
    for pattern, replacement in html_passes:
        source_cleaned = re.sub(pattern, replacement, source_cleaned)

    # Markdown clean-up, applied in this exact order. In the raw replacement
    # templates "\n" is expanded to a newline by re.sub.
    markdown_passes = (
        (r'\n\n+', '\n\n'),
        (r'\s\s+', ' '),
        (r'\)\s(.*?)\s?\[', r')\n\g<1> ['),
        (r'.+!?\[\]\((.*?)\)(.+)?', ''),
        (r'.+!?\[(.*?)\]\((.*?)\?(.*?)\)(.+)?', ''),
        (r'={4,}|-{4,}|\[(.*)\]\(#(.*?)\)|^((\w.+)?#{1,6})\s|#\d+\s#{1,6}\s|(\s\*){2,}\s.+', '\n'),
        (r'\n\n+', '\n\n'),
    )
    source_marked = html_to_markdown(source_cleaned)
    for pattern, replacement in markdown_passes:
        source_marked = re.sub(pattern, replacement, source_marked)

    # count/flags are passed by keyword; passing them positionally is deprecated.
    source_marked = re.sub(
        r'^\n([^\:]+)\[(.*?)\]\((.*?)\)', '', source_marked, count=0, flags=re.MULTILINE
    )
    source_marked = re.sub(
        r'(\w|\.|\-)\s\[(.*?)\]\((.*?)\/tag\/(.*)\)\s', r'\g<1> \g<2> ', source_marked
    )
    source_marked = re.sub(r'(\W|\w)\s\[', r'\g<1>\n\n[', source_marked)

    links = []
    content_parts = []

    # Lines matching this pattern are dropped entirely.
    blockline = r'^ADVERTISEMENT|\[Selengkapnya\]'

    for line in source_marked.split('\n'):
        line = line.strip()
        if line == '':
            continue

        line = re.sub(r'ADVERTISEMENT SCROLL TO RESUME CONTENT', '', line)

        if re.search(blockline, line) is not None:
            continue

        anchor = re.search(regex_anchor, line)

        if anchor:
            # Lines holding a link never contribute to the text content.
            if anchor.group(1):
                links.append({
                    'title': re.sub('#{2,6}', '', anchor.group(1)).strip(),
                    'url': anchor.group(2).strip(),
                })
        elif (
            '##' not in line
            and re.search(regex_currency, line) is None
            and re.search(regex_discount, line) is None
        ):
            content_parts.append(line)

    return ( links, ' '.join(content_parts).strip() )

In [454]:
def extract_details(source: str = ''):
    """Placeholder for page-detail extraction: every field is an empty string.

    ``source`` is accepted but not read.

    Returns:
        A 14-tuple of empty strings, positionally:
        (authors, banner, date, description, images, keywords, locale, pdf,
        site, summary, title, type, url, videos).
    """
    detail_fields = (
        'authors', 'banner', 'date', 'description', 'images', 'keywords',
        'locale', 'pdf', 'site', 'summary', 'title', 'type', 'url', 'videos',
    )

    # One empty value per field, in the order listed above.
    return tuple('' for _ in detail_fields)

In [439]:
def extract_metadata(source: str = ''):
    """Placeholder for metadata extraction: returns a new empty dict.

    ``source`` is accepted but not read.
    """
    return dict()

In [500]:
def extract_tables(source: str = ''):
    """Convert every ``<table>`` element found in ``source``.

    Comments, script blocks, the primary multi-line and single-line elements
    and the listed attributes are removed first; each remaining
    ``<table>...</table>`` span is then handed to
    ``html_to_json.convert_tables``.

    Returns:
        A dict keyed ``table_1``, ``table_2``, ... in order of appearance,
        holding whatever ``convert_tables`` returned for each span.
    """
    # Strip passes, applied in this order, each replacing matches with ''.
    strip_patterns = (
        regex_comment,
        regex_rendering_area,
        regex_primary_multiline,
        regex_primary_single_line,
        regex_attributes_from_element,
    )

    stripped = source
    for pattern in strip_patterns:
        stripped = re.sub(pattern, '', stripped)

    table_pattern = r'(<table\s?(.*?)>([\s\S]*?)<\/table>)'

    return {
        'table_' + str(position): html_to_json.convert_tables(found.group(1))
        for position, found in enumerate(re.finditer(table_pattern, stripped), start=1)
    }

In [505]:
def extract(source: str = ''):
    """Run every extractor over ``source`` and merge the results into one dict.

    Calls ``extract_details``, ``extract_products``, ``extract_contents``,
    ``extract_tables`` and ``extract_metadata`` in that order.

    Returns:
        A dict with one key per extracted field plus ``operation``, which
        holds the ``time()`` readings taken before and after the extractors
        ran and their difference.
    """
    started_at = time()

    (
        authors, banner, date, description, images, keywords, locale,
        pdf, site, summary, title, kind, url, videos,
    ) = extract_details(source)
    products = extract_products(source)
    links, content = extract_contents(source)
    tables = extract_tables(source)
    metadata = extract_metadata(source)

    finished_at = time()

    timing = {
        'start': started_at,
        'finish': finished_at,
        'elapsed': finished_at - started_at,
    }

    # Keyword order is the key order of the resulting dict.
    return dict(
        authors=authors,
        banner=banner,
        content=content,
        date=date,
        description=description,
        images=images,
        keywords=keywords,
        links=links,
        locale=locale,
        metadata=metadata,
        operation=timing,
        pdf=pdf,
        products=products,
        site=site,
        summary=summary,
        tables=tables,
        title=title,
        type=kind,
        url=url,
        videos=videos,
    )

In [467]:
# Run the extractor on the test_detik.html fixture and pretty-print the result.
# The encoding is pinned so decoding does not depend on the platform's locale.
with open('./test_detik.html', encoding='utf-8') as file:
    print(json.dumps(extract(file.read()), indent=2))

{
  "authors": "",
  "banner": "",
  "content": "Matius Alfons - detikNews Minggu, 04 Sep 2022 21:39 WIB Foto: Lumba-lumba (Dok.Instagram Menteri LHK Siti Nurbaya) **Jakarta** - Menteri Lingkungan Hidup dan Kehutanan RI Siti Nurbaya Bakar mengungkap terkait pelepasan tiga lumba-lumba hidung botol bernama Johny, Rambo, dan Rocky kembali ke habitat alaminya. Ketiga lumba-lumba itu dilepas liar setelah 7 hingga 8 tahun berada di pameran. Pelepasliaran itu disampaikan langsung oleh Siti Nurbaya melalui akun Instagram resminya @siti.nurbayabakar. Dia menyebut pelepasliaran ini pertama kalinya di dunia. \"Untuk pertama kali dilakukan di dunia, pelepasliaran lumba-lumba hidung botol bernama Johny, Rambo dan Rocky,\" kata Siti Nurbaya dalam caption Instagramnya, Minggu (4/9/2022).  Siti Nurbaya mengatakan ketiga lumba-lumba ini dibebaskan setelah 7 hingga 8 tahun berada di pameran. Ketiganya, pun sempat dilatih selama 3 tahun untuk mencari makan sendiri di alam. \"Sebelumnya setelah diselamatk

In [476]:
# Run the extractor on the test_entercomputer.html fixture and pretty-print the result.
# The encoding is pinned so decoding does not depend on the platform's locale.
with open('./test_entercomputer.html', encoding='utf-8') as file:
    print(json.dumps(extract(file.read()), indent=2))

{
  "authors": "",
  "banner": "",
  "content": "* * + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + * * + + + + + + + + + + + + + + + + + + + + + + + + + + * + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [KVM (Keyboard Video/Monitor Mouse) Switches](https://enterkomputer.com/subcategory/1/accessories/156) + * + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + * + + + + + + + + + + + + * + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + * + + + + + + + + + + + + + + + + + + + + + + * + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + * + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + * * * * ! ",
  "date": "",
  "description": "",
  "images": "",
  "keywords": "",
  "locale": "",
  "metadata": {},
  "o

In [468]:
# Run the extractor on the test_instagram.html fixture and pretty-print the result.
# The encoding is pinned so decoding does not depend on the platform's locale.
with open('./test_instagram.html', encoding='utf-8') as file:
    print(json.dumps(extract(file.read()), indent=2))

{
  "authors": "",
  "banner": "",
  "content": "",
  "date": "",
  "description": "",
  "images": "",
  "keywords": "",
  "locale": "",
  "metadata": {},
  "operation": {
    "start": 1662386845.768528,
    "finish": 1662386845.859267,
    "elapsed": 0.09073901176452637
  },
  "pdf": "",
  "products": [],
  "related": [
    {
      "title": "Posts",
      "url": "/owlengine/"
    },
    {
      "title": "Tagged",
      "url": "/owlengine/tagged/"
    }
  ],
  "site": "",
  "summary": "",
  "tables": {},
  "title": "",
  "type": "",
  "url": "",
  "videos": ""
}


In [469]:
# Run the extractor on the test_instagram_detail.html fixture and pretty-print the result.
# The encoding is pinned so decoding does not depend on the platform's locale.
with open('./test_instagram_detail.html', encoding='utf-8') as file:
    print(json.dumps(extract(file.read()), indent=2))

{
  "authors": "",
  "banner": "",
  "content": "Web Scraping is the process of using bots to extract information and data from websites. Where basically Web Scraping extracts the code structure (HTML) on the page and retrieves the required information.In this article, we will explain how to create a Web Scraper to get information. For example, information on the World Wide Life organization website. To get information about the Main Image, a Summary of Facts, Detailed Information, and Statistics on the Orangutan species.https://owlengine.com/articles/the-basic-of-web-scraping-using-pythonFeel free to ask questions or if you have any suggestions.Thank you Edited\u00a0\u00b7\u00a03d More posts from ",
  "date": "",
  "description": "",
  "images": "",
  "keywords": "",
  "locale": "",
  "metadata": {},
  "operation": {
    "start": 1662386848.319922,
    "finish": 1662386848.397002,
    "elapsed": 0.07708001136779785
  },
  "pdf": "",
  "products": [],
  "related": [
    {
      "title"

In [501]:
# Run the extractor on the test_investing.html fixture and pretty-print the result.
# The encoding is pinned so decoding does not depend on the platform's locale.
with open('./test_investing.html', encoding='utf-8') as file:
    print(json.dumps(extract(file.read()), indent=2))

{
  "authors": "",
  "banner": "",
  "content": "Breaking News Advertisement Stock Quotes By -\u00a0Sep 04, 2022 Ryan Cohen is trying to revive growth at GameStop, which has seen sales slowed as gamers have shifted to digital downloads Kroger is experiencing shoppers \u201caggressively\u201d turning to cheaper... By -\u00a0Sep 04, 2022 For the week ahead, the focus will be on Fed Chair Powell's speech and the ISM Services PMI. ExxonMobil is a buy amid fresh oil-market developments. DocuSign is set to struggle on a worsening... \") European Gas Prices Fall on Signs That Nord Stream Flows Will Resume By Investing.com News - EN](https://www.investing.com/news/commodities-news/european-gas-prices-fall-on-signs-that-nord-stream-flows-will-resume-2884958?obOrigUrl=true) Collapse Imminent For The Banks In The Eurozone? Analysis - EN](https://www.investing.com/analysis/collapse-imminent-for-the-banks-in-the-eurozone-200626756?obOrigUrl=true) Inside Sarah Sherman Samuel\u2019s Dream Home Jenni 

In [502]:
# Run the extractor on the test_linkedin.html fixture and pretty-print the result.
# The encoding is pinned so decoding does not depend on the platform's locale.
with open('./test_linkedin.html', encoding='utf-8') as file:
    print(json.dumps(extract(file.read()), indent=2))

{
  "authors": "",
  "banner": "",
  "content": "[Front End Developer ](https://id.linkedin.com/jobs/view/front-end-developer-at-avana-3216514281?refId=AITYKPcsWXItLNH9CRLHyA%3D%3D&trackingId=dFRcFWR7h4ffJYGMJZK3ZQ%3D%3D&trk=public_jobs_topcard-title) * Slicing mockup design * Develop frontend using html, CSS, React.JS * Optimize website performance **REQUIREMENT*** At least 2 years experienced as Frontend Developer * Ability to work as a team-player * Have a big attention to details * Able to deliver lines of code with good comments * Ability to produce high quality code * Having knowledge using Gitflow is a plus * Having knowledge using Twig Editor is a plus * Having experienced working in e-commerce is a plus * Experience working with RESTful Services and APIs is a plus",
  "date": "",
  "description": "",
  "images": "",
  "keywords": "",
  "locale": "",
  "metadata": {},
  "operation": {
    "start": 1662388277.402483,
    "finish": 1662388277.506547,
    "elapsed": 0.104063987731

In [480]:
# Run the extractor on the test_nytimes.html fixture and pretty-print the result.
# The encoding is pinned so decoding does not depend on the platform's locale.
with open('./test_nytimes.html', encoding='utf-8') as file:
    print(json.dumps(extract(file.read()), indent=2))

{
  "authors": "",
  "banner": "",
  "content": "* The U.S. and other nations have dismissed the accusations, yet the claims continue to circulate, fueling conspiracy theories.](https://www.nytimes.com/2022/09/04/technology/russia-bioweapons-geneva.html) Atul Loke for The New York Times](https://www.nytimes.com/2022/09/04/business/energy-environment/india-electric-vehicles-moped-rickshaw.html) Kayana Szymczak for The New York Times](https://www.nytimes.com/2022/09/04/science/brain-language-research.html) Timothy Mulcare for The New York Times](https://www.nytimes.com/2022/09/04/nyregion/nyc-outdoor-dining-sheds.html) options, along with some questionable claims.](https://www.nytimes.com/2022/09/03/climate/cooling-clothes-climate-change.html) Josie Norton](https://www.nytimes.com/2022/09/03/climate/cooling-clothes-climate-change.html) 2:59 Water Crisis in Jackson Is \u2018Unacceptable,\u2019 Residents Say Nailah Morgan and Christina Kelso * * Rachel Wisniewski for The New York Times](ht

In [463]:
# Run the extractor on the test_petanikode.html fixture and pretty-print the result.
# The encoding is pinned so decoding does not depend on the platform's locale.
with open('./test_petanikode.html', encoding='utf-8') as file:
    print(json.dumps(extract(file.read()), indent=2))

{
  "authors": "",
  "banner": "",
  "content": "Home yang mampu menyimpan berbagaimacam hal. *List* biasanya digunakan untuk menyimpan koleksi data. Namun, *list* ternyata memiliki kekurangan. **Kekurangannya**: ia tidak bisa menggunakan kata kunci untuk mengakses itemnya. Hanya bisa menggunakan nomer indeks saja. Tapi tenang saja\u2026 Kekurangn ini sudah ditutipi oleh *Dictionary*. Nah, pada kesempatan ini, kita akan belajar 7 hal dasar yang harus diketahui tentang *Dictionary*. Apa saja itu? Mari kita simak\u2026 1. Apa itu Dictionary pada Python? *Dictionary* adalah stuktur data yang bentuknya seperti kamus. Ada kata **kunci** kemudian ada **nilaninya**. Kata kunci harus unik, sedangkan nilai boleh diisi denga apa saja. Contoh: ``` aku = { \"nama\" : \"Petani Kode\" , \"url:\" \"https://www.petanikode.com\" } ``` Pada contoh di atas kita membuat sebuah *Dictionary* bernama `aku` dengan isi data nama dan URL. `nama` dan `url` adalah kunci *(key)* yang akan kita gunakan untuk mengak

In [503]:
# Run the extractor on the test_pricebook.html fixture and pretty-print the result.
# The encoding is pinned so decoding does not depend on the platform's locale.
with open('./test_pricebook.html', encoding='utf-8') as file:
    print(json.dumps(extract(file.read()), indent=2))

{
  "authors": "",
  "banner": "",
  "content": "* HP Smartphone Daftar HP Murah Terbaru September 2022 [![OPPO A31 (2020) RAM 6GB ROM 128GB](https://d1n6dbtoa2690v.cloudfront.net/images/product/M/92621_M_1.jpg)](https://www.pricebook.co.id/OPPO-A31-2020-RAM-6GB-ROM-128GB/40/PD_00092621) 2020 [#1 HP OPPO A31 (2020) [* harga 2022 [* harga RAM 4GB ROM 64GB](https://d1n6dbtoa2690v.cloudfront.net/images/product/M/96994_M_1.jpg)](https://www.pricebook.co.id/Vivo-Y21-2021-RAM-4GB-ROM-64GB/40/PD_00096994) 2021 [#3 HP Vivo Y21 (2021) 2021 [![OPPO A31 (2020) RAM 4GB ROM 128GB](https://d1n6dbtoa2690v.cloudfront.net/images/product/M/92188_M_1.jpg)](https://www.pricebook.co.id/OPPO-A31-2020-RAM-4GB-ROM-128GB/40/PD_00092188) 2020 [#5 HP OPPO A31 (2020) [* harga 2021 2021 [* harga 2022 2021 [* harga 2019 2021 [* harga 2021 2022 2021 2020 [* harga 2022 2021 2019 2021 2022 2020 [* harga 2021 [* harga 2022 [![OPPO A9 (2020) 8GB](https://d1n6dbtoa2690v.cloudfront.net/images/product/M/86689_M_1.jpg)](htt

In [461]:
# Run the extractor on the test_shopee.html fixture and pretty-print the result.
# The encoding is pinned so decoding does not depend on the platform's locale.
with open('./test_shopee.html', encoding='utf-8') as file:
    print(json.dumps(extract(file.read()), indent=2))

{
  "authors": "",
  "banner": "",
  "content": "",
  "date": "",
  "description": "",
  "images": "",
  "keywords": "",
  "locale": "",
  "metadata": {},
  "operation": {
    "start": 1662372660.8359628,
    "finish": 1662372661.3642519,
    "elapsed": 0.5282890796661377
  },
  "pdf": "",
  "products": [
    {
      "title": "[BEBAS PILIH SIZE] SANDAL WUDHU MASJID MUSHOLLA KANTOR UNTUK WAKAF SEDEKAH INFAQ DI MESJID MUSHOLA",
      "prices": [
        [
          "Rp",
          "6",
          ".500",
          null
        ]
      ],
      "discounts": [],
      "image": {
        "title": "[BEBAS PILIH SIZE] SANDAL WUDHU MASJID MUSHOLLA KANTOR UNTUK WAKAF SEDEKAH INFAQ DI MESJID MUSHOLA",
        "url": "https://cf.shopee.co.id/file/d82426d7d530f0efc5093354bfae2a71_tn"
      },
      "description": "BEBAS PILIH SIZE] SANDAL WUDHU MASJID MUSHOLLA KANTOR UNTUK WAKAF SEDEKAH INFAQ DI MESJID MUSHOLA Rp 6.500 10RB+ Terjual KOTA TASIKMALAYA",
      "url": "/-BEBAS-PILIH-SIZE-SANDAL-WUDHU-M

In [460]:
# Run the extractor on the test_stackoverflow.html fixture and pretty-print the result.
# The encoding is pinned so decoding does not depend on the platform's locale.
with open('./test_stackoverflow.html', encoding='utf-8') as file:
    print(json.dumps(extract(file.read()), indent=2))

{
  "authors": "",
  "banner": "",
  "content": "* Viewed 1k times This question shows research effort; it is useful and clear 1 This question does not show any research effort; it is unclear or not useful 2 Bookmark this question. Show activity on this post. I have coin currency market application and I would like to integrate a graph in it, however I don't know from where data should be taken. Is there some API that I can integrate into my highcharts in order to get graphical representation of values of the coin in the past 7,14,30 days? I have scrapped all data from <http://coinmarketcap.com/> but I can't scrap graph because it's canvas. Can you help me with that? Where to search for coin data in the past? | 2 Answers 2 **Trending** sort is based off of the default sorting method \u2014 by highest score \u2014 but it boosts votes that have happened recently, helping to surface more up-to-date answers. It falls back to sorting by highest score if no posts are trending. This answer is

In [459]:
# Run the extractor on the test_tokopedia.html fixture and pretty-print the result.
# The encoding is pinned so decoding does not depend on the platform's locale.
with open('./test_tokopedia.html', encoding='utf-8') as file:
    print(json.dumps(extract(file.read()), indent=2))

{
  "authors": "",
  "banner": "",
  "content": "[Kejar Diskon Spesial Slimsure Block Fat 15s (Memblock Lemak Pada Makanan yang dikonsumsi) [Kejar Diskon Spesial FIESTA NUGGET 500 GR (2 PACK) [![(2PCS) Legrand Kaptika - Stop Kontak Schuko - 16A - Putih](https://images.tokopedia.net/img/cache/250-square/VqbcmM/2021/11/24/9d553d93-ce3b-4661-a33a-9d828d791087.jpg.webp?ect=3g)](https://www.tokopedia.com/legrandindonesia/2pcs-legrand-kaptika-stop-kontak-schuko-16a-putih?extParam=src%3Dmultiloc%26whid%3D7779493) [Kejar Diskon Spesial (2PCS) [Kejar Diskon Spesial LEGO\u00ae Speed Champions McLaren Elva (76902) [![EVERCOSS M6A (3GB) [Kejar Diskon Spesial EVERCOSS M6A (3GB) ",
  "date": "",
  "description": "",
  "images": "",
  "keywords": "",
  "locale": "",
  "metadata": {},
  "operation": {
    "start": 1662372648.7713869,
    "finish": 1662372649.279344,
    "elapsed": 0.5079572200775146
  },
  "pdf": "",
  "products": [
    {
      "title": "K Natural White Sabun Mandi Sparkling Magnolia 

In [458]:
# Run the extractor on the test_wikipedia.html fixture and pretty-print the result.
# The encoding is pinned so decoding does not depend on the platform's locale.
with open('./test_wikipedia.html', encoding='utf-8') as file:
    print(json.dumps(extract(file.read()), indent=2))

{
  "authors": "",
  "banner": "",
  "content": "* [Pembicaraan](/wiki/Istimewa:Pembicaraan_saya \"Pembicaraan tentang suntingan dari alamat IP ini [ctrl-option-n]\") * [Kontribusi](/wiki/Istimewa:Kontribusi_saya \"Daftar suntingan yang dibuat dari alamat IP ini [ctrl-option-y]\") * , dikenal sebagai perikarp (*pericarpium*). Perikarp ini sering berkembang lebih jauh, sehingga dapat dibedakan atas dua lapisan atau lebih. Bagian luar disebut dinding luar, eksokarp (*exocarpium*), atau epikarp (*epicarpium*); bagian dalam disebut dinding dalam atau endokarp (*endocarpium*); serta lapisan tengah (bisa beberapa lapis) yang disebut dinding tengah atau mesokarp (*mesocarpium*). Pada sebagian buah, khususnya buah tunggal yang berasal dari bakal buah tenggelam, kadang-kadang bagian-bagian bunga yang lain (umpamanya tabung perhiasan bunga, kelopak, mahkota, atau benangsari) bersatu dengan bakal buah dan turut berkembang membentuk buah. Jika bagian-bagian itu merupakan bagian utama dari buah, ma