## Required functions for feature extraction:

In [39]:
import urllib.request
from bs4 import BeautifulSoup

BASE_URL = 'https://www.mega.pk/'


def get_image(new_soup):
    """Return the absolute URL of the product image in *new_soup*.

    Looks up the first ``<img>`` element whose class is ``img-responsive``
    and resolves its ``src`` attribute against ``BASE_URL``.

    Args:
        new_soup: Parsed product page exposing a BeautifulSoup-style
            ``find(name, attrs=...)`` method.

    Returns:
        The absolute image URL as a string, or ``'N/A'`` when the element is
        missing, has no (or an empty) ``src``, or the lookup fails.
    """
    # urljoin's documented home is urllib.parse; reaching it through
    # urllib.request only works because that module happens to import it.
    from urllib.parse import urljoin

    try:
        img = new_soup.find('img', attrs={'class': 'img-responsive'})
        if img is None:
            return 'N/A'
        src = (img.attrs.get('src') or '').strip()
        # An empty src would resolve to the bare site URL, which is not an
        # image, so report it as missing instead.
        if not src:
            return 'N/A'
        return urljoin(BASE_URL, src)
    except Exception as e:
        # Best-effort extraction: one malformed page must not abort the
        # whole crawl, so log the problem and fall back to the placeholder.
        print(f"[get_image] Error: {e}")
        return 'N/A'

def get_title(new_soup):
    """Return the product title text from *new_soup*.

    Searches for the first ``<h2>`` element with class ``product-title`` and
    returns its text with surrounding whitespace removed.  Returns ``'N/A'``
    when no such element is found or when the lookup raises.
    """
    try:
        heading = new_soup.find('h2', attrs={'class': 'product-title'})
        if not heading:
            return 'N/A'
        stripped_text = heading.text.strip()
        return stripped_text
    except Exception as err:
        print(f"[get_title] Error: {err}")
        return 'N/A'

import re

def get_price(new_soup):
    """Return the product price from *new_soup* as an integer.

    Reads the text of the ``<span id="price">`` element and parses the first
    number in it.  Commas are treated as thousands separators and any
    fractional part after a decimal point is discarded, so ``"Rs 1,234.50"``
    yields ``1234``.

    Args:
        new_soup: Parsed product page exposing a BeautifulSoup-style
            ``find(name, attrs=...)`` method.

    Returns:
        The price as an ``int``, or ``'N/A'`` when the element is missing,
        contains no digits, or the lookup fails.
    """
    try:
        price_tag = new_soup.find('span', attrs={'id': 'price'})
        if price_tag is None:
            return 'N/A'

        # Match one number only.  Concatenating every digit run in the text
        # would turn "1,234.50" into 123450 and merge unrelated numbers.
        match = re.search(r'\d[\d,]*(?:\.\d+)?', price_tag.text)
        if match is None:
            return 'N/A'

        whole_part = match.group().split('.', 1)[0]
        return int(whole_part.replace(',', ''))
    except Exception as e:
        # Best-effort extraction: log and fall back to the placeholder so a
        # single bad page does not stop the crawl.
        print(f"[get_price] Error: {e}")
        return 'N/A'

def get_category(url):
    """Return the first path segment of *url* as the category name.

    For ``https://www.mega.pk/laptop-price-pakistan/1/`` this returns
    ``"laptop-price-pakistan"``.

    Args:
        url: URL string to inspect.

    Returns:
        The first non-empty path segment, or ``"N/A"`` when the URL has no
        path segments (for example the site root) or cannot be parsed.
    """
    # urlparse's documented home is urllib.parse; reaching it through
    # urllib.request only works because that module happens to import it.
    from urllib.parse import urlparse

    try:
        path = urlparse(url).path
        # str.split never returns an empty list ("".split("/") == [""]), so
        # test the segment itself rather than the list to detect "no path".
        first_segment = path.strip("/").split("/", 1)[0]
        return first_segment or "N/A"
    except (ValueError, TypeError, AttributeError) as e:
        print(f"[get_category] Error: {e}")
        return "N/A"

def get_soup(url, timeout=30):
    """Download *url* and return it parsed as a ``BeautifulSoup`` document.

    The request is sent with the module-level ``headers`` dict, which must
    be defined before this function is called.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend, or
        ``None`` when the page could not be fetched or parsed.  ``None`` is
        used (rather than a placeholder string) so that callers' ``if not
        soup`` / ``is None`` checks actually detect the failure.
    """
    try:
        req = urllib.request.Request(url, headers=headers)
        # The context manager closes the connection even if read() raises;
        # the timeout keeps a stalled server from hanging the crawl.
        with urllib.request.urlopen(req, timeout=timeout) as response:
            html = response.read()
        return BeautifulSoup(html, "html.parser")
    except Exception as e:
        # Best-effort fetch: network, HTTP and parsing problems are all
        # reported the same way so the crawl can move on to the next URL.
        print(f"[get_soup] Error loading {url}: {e}")
        return None


# Main Code:

In [41]:
import urllib.request
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import re
import json
import time

# Crawl mega.pk: collect the category links from the homepage menu, walk each
# category's numbered pages, scrape every product page found there, and dump
# the collected records to mega_products.json.
base_url = 'https://www.mega.pk/'
headers = {'User-Agent': 'Mozilla/5.0'}
all_products = []
homepage_soup = get_soup(base_url)
category_links = []

# A failed fetch yields a non-soup value (None or a placeholder string), so
# check the type rather than truthiness to detect it reliably.
if isinstance(homepage_soup, BeautifulSoup):
    menu_items = homepage_soup.find_all('li', attrs={'class': 'dropdown yamm-fw'})
    for item in menu_items:
        a_tag = item.find('a')
        if a_tag and 'href' in a_tag.attrs:
            full_url = urljoin(base_url, a_tag['href'])
            category_links.append(full_url)

for category_url in category_links:
    # The category name depends only on the category URL, so derive it once
    # instead of once per product.
    category = get_category(category_url)
    # Product URLs already scraped in this category.  If a page brings
    # nothing new (e.g. the site repeats its last page for out-of-range page
    # numbers) the pagination loop stops instead of running forever.
    seen_links = set()
    page = 1
    while True:
        # Normalise the trailing slash so the page number always becomes its
        # own path segment, whether or not the category URL ends with "/".
        paged_url = f"{category_url.rstrip('/')}/{page}/"
        print(f"\n Scanning: {paged_url}")
        soup = get_soup(paged_url)
        if not isinstance(soup, BeautifulSoup):
            break

        product_boxes = soup.find_all('div', attrs={'class': 'lap_thu_box bg-color-white'})
        if not product_boxes:
            break

        product_links = []
        for box in product_boxes:
            a = box.find('a')
            if a and 'href' in a.attrs:
                link = urljoin(base_url, a['href'])
                if link not in seen_links:
                    seen_links.add(link)
                    product_links.append(link)

        if not product_links:
            break
        for product_url in product_links:
            print(f"Product: {product_url}")
            product_soup = get_soup(product_url)
            if not isinstance(product_soup, BeautifulSoup):
                continue

            product_data = {
                'category': category,
                'image_url': get_image(product_soup),
                'title': get_title(product_soup),
                'price': get_price(product_soup),
                'url': product_url
            }

            all_products.append(product_data)
            # Short pause between product requests to go easy on the server.
            time.sleep(0.5)

        page += 1
        time.sleep(1)

with open('mega_products.json', 'w', encoding='utf-8') as f:
    json.dump(all_products, f, indent=4, ensure_ascii=False)

print(f"\nDone! {len(all_products)} products saved to mega_products.json")



 Scanning: https://www.mega.pk/laptop-price-pakistan/1/
Product: https://www.mega.pk/laptop_products/26644/ASUS-ROG-Strix-G16-Core-i7-13th-Generation-16GB-Ram-512GB-SSD-8GB-NVIDIA-RTX4070-Windows-11.html
Product: https://www.mega.pk/laptop_products/26608/HP-Victus-16-R0328TX-Core-i7-13th-Generation-16GB-RAM-512GB-SSD-8GB-RTX-4060-DOS.html
Product: https://www.mega.pk/laptop_products/26584/Apple-MacBook-Air-13-MC6C4-M4-Chip-10-Core-CPU-10-Core-GPU-24GB-Ram-512GB-SSD-Midnight.html
Product: https://www.mega.pk/laptop_products/26583/Apple-MacBook-Air-13-MC6T4-M4-Chip-10-Core-CPU-8-Core-GPU-16GB-Ram-256GB-SSD-Sky-Blue.html
Product: https://www.mega.pk/laptop_products/26582/Apple-MacBook-Air-13-MW133-M4-Chip-10-Core-CPU-10-Core-GPU-16GB-Ram-512GB-SSD-Midnight.html
Product: https://www.mega.pk/laptop_products/26581/Apple-MacBook-Air-13-MW123-M4-Chip-10-Core-CPU-8-Core-GPU-16GB-Ram-256GB-SSD-Midnight.html
Product: https://www.mega.pk/laptop_products/26580/Apple-MacBook-Air-13-MW0Y3-M4-Chip-10