# Required functions for feature extraction:

In [9]:
import urllib.request
from bs4 import BeautifulSoup
from collections import defaultdict
import time



def get_title(new_soup):
    """Return the product title from a parsed product page.

    Looks up the ``<span>`` carrying ``class="base"``,
    ``data-ui-id="page-title-wrapper"`` and ``itemprop="name"`` and returns
    its whitespace-stripped text.

    Args:
        new_soup: Parsed product page; any object exposing a
            BeautifulSoup-style ``find(name, attrs=...)`` method.

    Returns:
        The stripped title text, or ``"N/A"`` when the element is missing,
        its text is empty, or the lookup fails unexpectedly.
    """
    try:
        title_element = new_soup.find(
            "span",
            attrs={
                "class": "base",
                "data-ui-id": "page-title-wrapper",
                "itemprop": "name",
            },
        )
        # A missing element is an expected outcome, not an error: answer
        # "N/A" directly instead of tripping over None and logging a
        # misleading AttributeError.
        if title_element is None:
            return "N/A"
        title = title_element.text.strip()
        return title if title else "N/A"
    except Exception as e:
        # Callers always receive a string, never an exception.
        print(f"Error in get_title: {e}")
        return "N/A"



def get_price(new_soup):
    """Return the text of the first ``<span class="price">`` on the page.

    Args:
        new_soup: Parsed product page; any object exposing a
            BeautifulSoup-style ``find(name, class_=...)`` method.

    Returns:
        The whitespace-stripped price text, or ``"N/A"`` when no price
        element is found or the lookup raises.
    """
    try:
        price_node = new_soup.find("span", class_="price")
        # Fall back to the placeholder when the page has no price element.
        return price_node.text.strip() if price_node else "N/A"
    except Exception as e:
        print(f"Error in get_price: {e}")
        return "N/A"





def get_sku(new_soup):
    """Return the product SKU from a parsed product page.

    Looks up the ``<div class="value" itemprop="sku">`` element and returns
    its whitespace-stripped text.

    Args:
        new_soup: Parsed product page; any object exposing a
            BeautifulSoup-style ``find(name, attrs=...)`` method.

    Returns:
        The stripped SKU text, or ``"N/A"`` when the element is missing,
        its text is empty, or the lookup fails unexpectedly.
    """
    try:
        sku_element = new_soup.find(
            "div", attrs={"class": "value", "itemprop": "sku"}
        )
        # A page without a SKU element is an expected case; report "N/A"
        # without logging a spurious NoneType error.
        if sku_element is None:
            return "N/A"
        sku = sku_element.text.strip()
        return sku if sku else "N/A"
    except Exception as e:
        # Callers always receive a string, never an exception.
        print(f"Error in get_sku: {e}")
        return "N/A"

def get_subcat(new_soup):
    """Return the sub-category name from a parsed product page.

    Takes the text of the first ``<a>`` inside the first
    ``<div class="cat-links">`` element.

    Args:
        new_soup: Parsed product page; any object exposing a
            BeautifulSoup-style ``find(name, attrs=...)`` method.

    Returns:
        The stripped link text, or ``"N/A"`` when the container or its
        link is missing, the text is empty, or the lookup fails
        unexpectedly.
    """
    try:
        container = new_soup.find('div', attrs={'class': 'cat-links'})
        # Either level of the lookup may legitimately be absent; treat
        # that as "no sub-category" rather than as an error to log.
        if container is None:
            return 'N/A'
        link = container.find('a')
        if link is None:
            return 'N/A'
        sub_cat = link.text.strip()
        return sub_cat if sub_cat else 'N/A'
    except Exception as e:
        # Callers always receive a string, never an exception.
        print(f"Error in get_subcat: {e}")
        return "N/A"


def get_all_features(soup):
    """Collect the specification table of a product page into a dict.

    Reads every ``<tr>`` of the table whose class attribute is
    ``"data table additional-attributes"``. Each row's ``<th>`` text becomes
    a key (stripped, lower-cased, spaces replaced by underscores) and its
    ``<td>`` text becomes the stripped value.

    Args:
        soup: Parsed product page; any object exposing a
            BeautifulSoup-style ``find(name, class_=...)`` method.

    Returns:
        A dict mapping normalized attribute names to their text values.
        Empty when the table is absent. If an unexpected error occurs, the
        entries gathered so far are returned.
    """
    specs = {}
    try:
        table = soup.find("table", class_="data table additional-attributes")
        if table is None:
            return specs
        for row in table.find_all("tr"):
            header_cell = row.find("th")
            value_cell = row.find("td")
            # Skip rows lacking a label or a value cell; otherwise a single
            # malformed row would raise and discard every row after it.
            if header_cell is None or value_cell is None:
                continue
            key = header_cell.text.strip().lower().replace(" ", "_")
            specs[key] = value_cell.text.strip()
    except Exception as e:
        print(f"[Spec error] {e}")
    return specs

# Main Code:

In [10]:
import urllib.request
from bs4 import BeautifulSoup
import json
import time
import csv


# Fetch the storefront home page and build a {category name: URL} map from
# its top-level navigation menu.
url = 'https://www.shophive.com/'
headers = {'User-Agent': 'Mozilla/5.0'}
req = urllib.request.Request(url, headers=headers)
# Bound the request so a stalled connection cannot hang the script, and
# close the response as soon as the body has been read.
with urllib.request.urlopen(req, timeout=10) as response:
    html = response.read()
soup = BeautifulSoup(html, "html.parser")

menu = soup.find("ul", class_="smartmenu magebig-nav")
if menu is None:
    # Fail with an explicit message instead of an AttributeError on None.
    raise RuntimeError(
        "Navigation menu 'smartmenu magebig-nav' not found on the home page; "
        "cannot discover categories."
    )
category_links = {}

top_items = menu.find_all("li", class_="level0")
for item in top_items:
    category_tag = item.find("a", class_="level-top")
    if category_tag and category_tag.has_attr("href"):
        try:
            # Prefer the first child of the inner <span> as the name.
            category_name = category_tag.find("span").contents[0].strip()
        except (AttributeError, IndexError, TypeError):
            # No <span>, an empty <span>, or a first child that is not
            # plain text: fall back to the link's full text.
            category_name = category_tag.get_text(strip=True)
        category_href = category_tag["href"].strip()
        # Only absolute links are kept.
        if category_href.startswith("http"):
            category_links[category_name] = category_href


# Walk every category listing page by page, fetch each product page once,
# and accumulate one dict per product in `data`.
data = []
visited_urls = set()

for category, base_url in category_links.items():
    print(f"\nCategory: {category}")
    page = 1
    # Links seen on the previous listing page of this category. If a listing
    # ever serves the exact same page again (e.g. for a page number past the
    # end), stop instead of requesting pages forever.
    previous_page_urls = None

    while True:
        paged_url = f"{base_url}?p={page}"
        print(f"  Page {page}: {paged_url}")
        try:
            req = urllib.request.Request(paged_url, headers=headers)
            with urllib.request.urlopen(req, timeout=10) as response:
                html = response.read()
            page_soup = BeautifulSoup(html, "html.parser")
            product_tags = page_soup.find_all("a", class_="product-item-link")

            if not product_tags:
                print("  No more products.")
                break

            current_page_urls = [tag.get("href") for tag in product_tags]
            if current_page_urls == previous_page_urls:
                print("  Page repeats the previous one; stopping.")
                break
            previous_page_urls = current_page_urls

            for tag in product_tags:
                try:
                    product_url = tag['href']
                    # A product may be listed in several places; fetch once.
                    if product_url in visited_urls:
                        continue
                    visited_urls.add(product_url)

                    req = urllib.request.Request(product_url, headers=headers)
                    # Same timeout as the listing request so a single
                    # stalled product page cannot block the whole crawl.
                    with urllib.request.urlopen(req, timeout=10) as response:
                        product_html = response.read()
                    new_soup = BeautifulSoup(product_html, "html.parser")

                    # Pause between product requests.
                    time.sleep(1.5)

                    product = {
                        "category": category,
                        "subcategory": get_subcat(new_soup),
                        "title": get_title(new_soup),
                        "price": get_price(new_soup),
                        "url": product_url,
                        "sku": get_sku(new_soup)

                    }
                    # Merge the specification table; its keys may overwrite
                    # the fixed fields above if they share a name.
                    specs = get_all_features(new_soup)
                    product.update(specs)

                    print(f"     - Product: {product['title']}")
                    data.append(product)

                except Exception as inner_e:
                    # One failing product must not stop the rest of the page.
                    print(f"  [Product Error]: {inner_e}")
                    continue

            page += 1
            time.sleep(2.5)

        except Exception as e:
            # A failing listing page ends this category; move to the next.
            print(f"  [Page Error]: {e}")
            break

    time.sleep(2)


# Union of every field name appearing in any scraped product, kept both as a
# set and as an alphabetically sorted list.
# NOTE(review): `fieldnames` is not used in the visible code; presumably meant
# for a CSV export — confirm.
all_keys = set().union(*(record.keys() for record in data))
fieldnames = sorted(all_keys)


# Dump all products as pretty-printed UTF-8 JSON, keeping non-ASCII text as-is.
with open("products.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

print("Saved to 'products.json'")




Category: Apple
  Page 1: https://www.shophive.com/apple?p=1
     - Product: Apple Vision Pro 512GB
     - Product: Apple iTunes Gift Card 15$
     - Product: Apple iTunes Gift Card 50$
     - Product: Apple iTunes Gift Card 25$
     - Product: Apple iTunes Gift Card 100$
     - Product: Apple 60W MagSafe 2 Power Adapter for MacBook Pro (MD565B)
     - Product: Apple 60W MagSafe 1 Power Adapter for MacBook and 13-inch MacBook Pro MC461B
     - Product: Apple iTunes Gift Card 10$
  [Product Error]: HTTP Error 500: Internal Server Error
     - Product: Apple 45W MagSafe 2 Power Adapter for MacBook Air MD592LL
     - Product: Apple Silicone Case for 9.7-inch iPad Pro - Charcoal Gray
     - Product: Mi Type-C to Lightning Cable 1m
     - Product: Apple Magic Keyboard for iPad Pro 11 inch 3rd generation and iPad Air 4th generation, 5th Gen US English
     - Product: Apple TV 3rd Generation 64GB 4K Wifi (MN873LL)
     - Product: Apple iMac M1 Chip 24 inch 8GB 256GB SSD
     - Product: Apple