# In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# HTTP headers sent with every request; the User-Agent mimics desktop Chrome
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
}

# Accumulator for scraped rows (one dict per product)
data = []

# Spreadsheet listing the categories to crawl
df = pd.read_excel("Category_links.xlsx")


def get_pagination_links(category_url):
    """Collect the URLs of every listing page that belongs to a category.

    Starting at ``category_url``, each page is fetched, the numbered
    pagination links found on it are recorded, and the "Next" arrow is
    followed until it is missing, empty, or points at a page that has
    already been fetched.

    Args:
        category_url: URL of the first page of the category.

    Returns:
        A list of unique, absolute page URLs in discovery order, starting
        with ``category_url``.

    Raises:
        requests.RequestException: propagated from ``requests.get`` when a
            page cannot be fetched.
    """
    from urllib.parse import urljoin

    # Dict keys de-duplicate like a set but keep discovery order, so the
    # caller scrapes pages in a stable, reproducible sequence.
    page_urls = {category_url: None}
    # Pages already requested. Kept separate from ``page_urls`` because the
    # numbered links usually contain the "Next" target too; testing the
    # "Next" target against ``page_urls`` would stop the crawl after page 1.
    fetched = set()
    current_url = category_url

    while True:
        fetched.add(current_url)
        response = requests.get(current_url, headers=HEADERS, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")

        # Record every numbered pagination link on this page.
        for link in soup.select(".m-ghostblock__nav-item[href]"):
            href = link["href"].strip()
            if href:
                # Resolve relative hrefs against the page they appeared on.
                page_urls.setdefault(urljoin(current_url, href), None)

        # Follow the "Next" arrow, if there is one.
        next_page_button = soup.select_one(".m-ghostblock__nav-item.arrow[data-href]")
        if next_page_button is None:
            break

        next_href = next_page_button["data-href"].strip()
        if not next_href:
            break

        next_url = urljoin(current_url, next_href)
        page_urls.setdefault(next_url, None)
        if next_url in fetched:
            # The arrow loops back to a page we already visited: done.
            break

        current_url = next_url
        time.sleep(1)  # be polite between page requests

    return list(page_urls)


def scrape_category(category_title, category_url):
    """Scrape every product listed in a category, across all of its pages.

    Each listing page returned by ``get_pagination_links`` is fetched and
    its product tiles are read. For every tile with a link element, the
    product's detail page is scraped via ``scrape_product_details`` and one
    row is appended to the module-level ``data`` list.

    Args:
        category_title: Display name stored in the "Category Name" column.
        category_url: URL of the category's first listing page; stored in
            the "Category Link" column.

    Returns:
        None. Results are appended to ``data``.

    Raises:
        requests.RequestException: propagated from ``requests.get`` when a
            listing or product page cannot be fetched.
    """
    from urllib.parse import urljoin

    for page_url in get_pagination_links(category_url):
        print(f"Scraping category page: {page_url}")
        response = requests.get(page_url, headers=HEADERS, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")

        for product in soup.select(".category-grid-tile.a-category-grid-tile"):
            link_tag = product.select_one(".category-grid-tile__link-wrapper.trackingElement")
            if link_tag is None:
                continue

            product_name = link_tag.get("title", "").strip()
            product_url = link_tag.get("href", "").strip()

            if product_url:
                # Resolve against the listing page so root-relative,
                # path-relative and protocol-relative links all work;
                # absolute links pass through unchanged.
                product_url = urljoin(page_url, product_url)
                product_details = scrape_product_details(product_url)
            else:
                # No link to follow: keep the row, but skip the request,
                # which would otherwise fail on an empty URL and abort
                # the whole run.
                product_details = {}

            # Save data
            data.append({
                "Category Name": category_title,
                "Category Link": category_url,
                "Product Name": product_name,
                "Product Link": product_url,
                **product_details,
            })
        time.sleep(1)  # pause between listing pages


def scrape_product_details(product_url):
    """Fetch a product page and return its technical-data table as a dict.

    Only table-body rows with exactly two ``<td>`` cells are used: the
    first cell's stripped text is the key and the second cell's stripped
    text is the value. An empty dict is returned when the page has no
    technical-data table.
    """
    page = requests.get(product_url, headers=HEADERS, timeout=10)
    parsed = BeautifulSoup(page.text, "html.parser")

    # Locate the technical data table; bail out early when it is absent
    spec_table = parsed.select_one(".o-technical_data table")
    if not spec_table:
        return {}

    cell_pairs = (tr.find_all("td") for tr in spec_table.select("tbody tr"))
    return {
        pair[0].text.strip(): pair[1].text.strip()
        for pair in cell_pairs
        if len(pair) == 2
    }


# Scrape all categories. Results are only held in memory while crawling, so
# the save step runs in a ``finally``: if a request fails or the run is
# interrupted, whatever was collected so far is still written out before the
# error propagates.
scrape_completed = False
try:
    for category_title, category_link in zip(df["Title"], df["Links"]):
        scrape_category(category_title, category_link)
        time.sleep(2)  # pause between categories
    scrape_completed = True
finally:
    # Save data to Excel. After a failure, skip the write when nothing was
    # collected so an earlier output file is not replaced by an empty one.
    if scrape_completed or data:
        pd.DataFrame(data).to_excel("Bosch_Products_All_Metrics.xlsx", index=False)
        if not scrape_completed:
            print(f"Scraping aborted. Partial data ({len(data)} rows) saved to Bosch_Products_All_Metrics.xlsx")

if scrape_completed:
    print("Scraping complete. Data saved to Bosch_Products_All_Metrics.xlsx")
