# Loading HTML page

In [35]:
import requests
from unstructured.partition.html import partition_html

def load_url(url):

    try:
        response = requests.get(url)

        if response.status_code == 200:
            html_content = response.text
            doc_elements = partition_html(text=html_content)
            print("Successfully fetched content!")
            return html_content, doc_elements
    except Exception as e:
        print(f"Error while fetching HTML content from URL {e}: {response.status_code}")

url = "https://www.daraz.com.np/"
html, partition = load_url(url)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


Successfully fetched content!


# Data Ingestion and Preprocessing HTML Content

In [36]:
html

'\r\n\r\n\r\n\r\n\n\n\n      \n        \n      \n    \n\n\n\n\n\n\n\n  \n\n  \n\n  \n\n\n\n  \n  \n\n\n\n\n  \n\n\n\n\n\n\n\n\n\n\n  \n  \n    \n  \n  \n  \n    \n  \n  \n    \n  \n\n\n\n\n\n\n  \n\n  \n\n\n\n\n\n\n\n\n\n\n\n  \n\n\n\n\n\n\n  \n\n\n\n\n\n  \n  <!DOCTYPE HTML>\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n  \n \n\n\n \n\n  \n    \n  \n\n  \n    \n  \n\n  \n    \n  \n\n  \n    \n  \n\n  \n    \n  \n\n\n\n\n  \n \n\n  \n    \n  \n\n  \n    \n  \n\n  \n    \n  \n\n  \n    \n  \n\n  \n    \n  \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n  \n\n\n\n\n\n\n\n\n\n  \n\n<html lang="en">\n<head>\n  <meta charset="utf-8">\n  <meta name="spm-id" content="a2a0e.11779170">\n  <meta name="keywords" content="">\n  <meta name="description" content="Looking for Online Shopping Nepal? Daraz.com.np online shopping site is one of the largest online shopping store in Kathmandu, Biratnagar across the country - Shop Now! ✓FREE Delivery">\n  \n  \n  \n  \n<meta property="og:url" content="https://www.daraz.com.n

In [38]:
partition

[<unstructured.documents.html.HTMLTitle at 0x201a9d37ad0>,
 <unstructured.documents.html.HTMLTitle at 0x201aaa6c2c0>,
 <unstructured.documents.html.HTMLTitle at 0x201aaa6c770>,
 <unstructured.documents.html.HTMLListItem at 0x201aaa6c7d0>,
 <unstructured.documents.html.HTMLListItem at 0x201aaa6c800>,
 <unstructured.documents.html.HTMLListItem at 0x201aaa6c980>,
 <unstructured.documents.html.HTMLListItem at 0x201aaa6ca10>,
 <unstructured.documents.html.HTMLListItem at 0x201aaa6cad0>,
 <unstructured.documents.html.HTMLListItem at 0x201aaa6cb60>,
 <unstructured.documents.html.HTMLTitle at 0x201aaa6cdd0>,
 <unstructured.documents.html.HTMLTitle at 0x201aaa6d130>,
 <unstructured.documents.html.HTMLTitle at 0x201aaa6d550>,
 <unstructured.documents.html.HTMLNarrativeText at 0x201943fb200>,
 <unstructured.documents.html.HTMLTitle at 0x201add3bb90>,
 <unstructured.documents.html.HTMLListItem at 0x201add3bce0>,
 <unstructured.documents.html.HTMLListItem at 0x201add3bda0>,
 <unstructured.documents

In [41]:
group = {
    'metadata': {
        'source': url,
    },
    'page_content': ''
}

all_groups = []

for element in partition:
    if 'unstructured.documents.html.HTMLTitle' in str(type(element)):
        if group['page_content']:
            all_groups.append(group)
            group = {
                'metadata': {
                    'source': url,
                },
                'page_content': ''
            }
        group['page_content'] += element.text
    elif 'unstructured.documents.html.HTMLNarrativeText' in str(type(element)):
        group['page_content'] += '. '+element.text 
    elif 'unstructured.documents.html.HTMLListItem' in str(type(element)):
        group['page_content'] += '\n'+element.text
    else:
        group['page_content'] += '\n\n'+element.text

if group['page_content']:
    all_groups.append(group)

for group in all_groups[:]:
    print(group)

{'metadata': {'source': 'https://www.daraz.com.np/'}, 'page_content': 'Become a Seller'}
{'metadata': {'source': 'https://www.daraz.com.np/'}, 'page_content': 'Payments & Recharge'}
{'metadata': {'source': 'https://www.daraz.com.np/'}, 'page_content': 'Help & Support\nHelp Center\nChat with Us\nOrder\nShipping & Delivery\nPayment\nReturns & Refunds'}
{'metadata': {'source': 'https://www.daraz.com.np/'}, 'page_content': 'Daraz Logistics Partner'}
{'metadata': {'source': 'https://www.daraz.com.np/'}, 'page_content': 'Save More on App'}
{'metadata': {'source': 'https://www.daraz.com.np/'}, 'page_content': 'Download the App. Success! Please check your phone for the download link'}
{'metadata': {'source': 'https://www.daraz.com.np/'}, 'page_content': 'Login\n|\nSign Up\nManage My Account\nMy Orders\nMy Wishlist & Followed Stores\nMy Reviews\nMy Returns & Cancellations\nLogout\n\nen\n                  ⌄'}
{'metadata': {'source': 'https://www.daraz.com.np/'}, 'page_content': 'Select Language'

In [1]:
# Direct partitioning
from unstructured.partition.html import partition_html

url = "https://www.daraz.com.np/"
partition = partition_html(url=url)

In [2]:
group = {
    'metadata': {
        'source': url,
    },
    'tag': '',
    'page_content': ''
}

title_groups = []

for element in partition:
    if 'unstructured.documents.html.HTMLTitle' in str(type(element)):
        if group['page_content']:
            title_groups.append(group)
            group = {
                'metadata': {
                    'source': url,
                },
                'tag': 'title',
                'page_content': ''
            }
        group['page_content'] += element.text

if group['page_content']:
    title_groups.append(group)

for group in title_groups[:]:
    print(group)

{'metadata': {'source': 'https://www.daraz.com.np/'}, 'tag': '', 'page_content': 'Become a Seller'}
{'metadata': {'source': 'https://www.daraz.com.np/'}, 'tag': 'title', 'page_content': 'Payments & Recharge'}
{'metadata': {'source': 'https://www.daraz.com.np/'}, 'tag': 'title', 'page_content': 'Help & Support'}
{'metadata': {'source': 'https://www.daraz.com.np/'}, 'tag': 'title', 'page_content': 'Daraz Logistics Partner'}
{'metadata': {'source': 'https://www.daraz.com.np/'}, 'tag': 'title', 'page_content': 'Save More on App'}
{'metadata': {'source': 'https://www.daraz.com.np/'}, 'tag': 'title', 'page_content': 'Download the App'}
{'metadata': {'source': 'https://www.daraz.com.np/'}, 'tag': 'title', 'page_content': 'Login'}
{'metadata': {'source': 'https://www.daraz.com.np/'}, 'tag': 'title', 'page_content': 'Select Language'}
{'metadata': {'source': 'https://www.daraz.com.np/'}, 'tag': 'title', 'page_content': 'EN / English'}
{'metadata': {'source': 'https://www.daraz.com.np/'}, 'tag':

In [3]:
narr_txt = []

for element in partition:
    if 'unstructured.documents.html.HTMLNarrativeText' in str(type(element)):
        group['tag'] = 'Narrative Text'
        group['page_content'] += '. '+element.text 

if group['page_content']:
    narr_txt.append(group)

for group in narr_txt[:]:
    print(group)

{'metadata': {'source': 'https://www.daraz.com.np/'}, 'tag': 'Narrative Text', 'page_content': "Stealth 13. Success! Please check your phone for the download link. E-commerce has evolved over the past few years and since it’s easier and more convenient, it is evident that customers are actually switching to the trend of online shopping. Daraz, the Nepali shopping store, brings a whole new concept by showcasing a number of famous brands under one roof. Not only does it fulfill clothing necessities of both men and women but you can also shop for all kinds of appliances like air conditioners, heaters, refrigerators, LED TVs and a lot more. Simply select your favorite brand like Samsung, Apple, HP, Huawei, Dell, Canon, Nikon, etc and get yourself the best electronic items.. Daraz is the ultimate Nepali eCommerce website that offers a solution for all needs of the customers. It has a wide and assorted range of products including clothing, electronics, mobile phones, home and living, health 

In [6]:
group = {
    'metadata': {
        'source': url,
    },
    'tag': '',
    'page_content': ''
}

ls_items = []

for element in partition:
    if 'unstructured.documents.html.HTMLListItem' in str(type(element)):
        group['tag'] = "Item's list"
        group['page_content'] += '| '+element.text

if group['page_content']:
    ls_items.append(group)
for group in ls_items[:]:
    print(group)

{'metadata': {'source': 'https://www.daraz.com.np/'}, 'tag': "Item's list", 'page_content': "| Help Center| Chat with Us| Order| Shipping & Delivery| Payment| Returns & Refunds| || Sign Up| Manage My Account| My Orders| My Wishlist & Followed Stores| My Reviews| My Returns & Cancellations| Logout| Women's Fashion| Health & Beauty| Men's Fashion| Watches & Accessories| Electronic Devices| TV & Home Appliances| Electronic Accessories| Groceries & Pets| Babies & Toys| Home & Lifestyle| Sports & Outdoor| Motors, Tools & DIY| Clothing\n                \n                \n                    \n                    \n                        \n                            \n                                \n                                    Clothing\n                                \n                            \n                            \n                            \n                                \n                                    \n                                        \n         

In [7]:
group = {
    'metadata': {
        'source': url,
    },
    'tag': '',
    'page_content': ''
}

txt = []

for element in partition:
    if 'unstructured.documents.html.HTMLText' in str(type(element)):
        group['tag'] = "Item's list"
        group['page_content'] += '| '+element.text

if group['page_content']:
    txt.append(group)
for group in txt[:]:
    print(group)

{'metadata': {'source': 'https://www.daraz.com.np/'}, 'tag': "Item's list", 'page_content': "| en\n                  ⌄| \ue76c\n                \n                    \n                        Proudly Nepali| \ue76c\n                \n                    \n                        Free Delivery| \ue7f9\n                \n                Fashion| \ue7f9\n                \n                Beauty| \ue768\n                \n                    \n                        Home & Decor| Daraz 11.11 Sale\n          \n            ,| 12.12 Sale\n          \n            ,| Dashain Dhamaka\n          \n            ,| Mobile Week\n          \n            ,| Valentine's Day Sale\n          \n            ,| Cricket Streaming\n          \n            ,| Grocery Shopping in Nepal\n          \n            ,| Realme GT 2 Pro\n          \n            ,| Samsung Mobile Phones\n          ,| Xiaomi Mobiles\n          ,| Nokia Mobiles\n          ,| Oppo\xa0Mobiles\n          ,| Apple iPhones\n          ,| OnePlu