# In [1]:
import http.client
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup

# Function to fetch webpage HTML content
def fetch_html(url, timeout=30):
    """Download the raw body of the page at ``url``.

    Args:
        url: Absolute URL to fetch, including its scheme (``https://...``).
        timeout: Seconds to wait on blocking network operations before
            giving up, so a stalled server cannot hang the caller forever.

    Returns:
        The response body as ``bytes``, or ``None`` when the page could not
        be fetched (the reason is printed).
    """
    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            return response.read()
    # OSError covers URLError/HTTPError and socket timeouts; ValueError is
    # raised for a malformed URL such as one typed without a scheme;
    # HTTPException covers protocol-level failures (invalid URL characters,
    # truncated responses) that urllib does not wrap in URLError.
    except (OSError, ValueError, http.client.HTTPException) as e:
        print("Error accessing the webpage:", e)
        return None

# Function to scrape product information from HTML
def scrape_products(html, url):
    """Extract product records from a page's HTML.

    Every ``<div class="product">`` becomes one dict with the keys
    ``name``, ``details`` and ``image``. Products without a usable image
    additionally carry ``page_link`` so the offending page can be located.

    Args:
        html: Markup of the page, as accepted by ``BeautifulSoup``.
        url: URL the markup was fetched from; stored as ``page_link`` on
            products whose image is missing.

    Returns:
        A list of product dicts, in document order (empty when the page
        contains no product elements).
    """
    soup = BeautifulSoup(html, 'html.parser')
    products = []

    for product_element in soup.find_all('div', class_='product'):
        name_element = product_element.find('h2', class_='product-name')
        details_element = product_element.find('div', class_='product-details')
        image_element = product_element.find('img', class_='product-image')

        product = {
            # 'N/A' is the placeholder for a missing name / details element
            'name': name_element.text.strip() if name_element is not None else 'N/A',
            'details': (
                details_element.text.strip()
                if details_element is not None
                else 'N/A'
            ),
        }

        # Use .get() rather than ['src']: an <img> tag without a src
        # attribute would otherwise raise KeyError and abort the scrape.
        image_src = image_element.get('src') if image_element is not None else None

        if image_src and image_src.strip():
            product['image'] = image_src
        else:
            # No <img>, or an <img> with an absent/blank src: either way
            # there is no image to show, so flag it and record the page.
            product['image'] = 'Missing Image'
            product['page_link'] = url

        products.append(product)

    return products

# Example usage: prompt for a URL, scrape its products and export those
# without an image to an Excel workbook.
# Strip the typed/pasted URL: stray leading or trailing whitespace would
# otherwise make the fetch fail.
input_value = input("Enter the website URL: ").strip()
html_content = fetch_html(input_value)

# fetch_html returns None on failure (and has already printed the reason);
# an empty body is skipped as well.
if html_content:
    products = scrape_products(html_content, input_value)

    # Fix the column set explicitly: 'page_link' is only present on products
    # with a missing image, and would be absent if no product lacked one.
    df = pd.DataFrame(products, columns=['name', 'details', 'image', 'page_link'])

    # Keep only the products flagged by scrape_products as lacking an image
    missing_images_df = df[df['image'] == 'Missing Image']

    # Save the missing images DataFrame to an Excel file
    output_file = 'missing_images.xlsx'
    missing_images_df.to_excel(output_file, index=False)

    print("Missing images saved to", output_file)


# Enter the website URL: https://yoshops.com/
# Missing images saved to missing_images.xlsx
