# Import Required Libraries
Import the necessary libraries: Selenium, BeautifulSoup, requests, Pillow, and matplotlib.

In [None]:
# Import Required Libraries
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import requests
from PIL import Image
from io import BytesIO
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import matplotlib.pyplot as plt

# Define Web Scraping Function
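Define the `extraer_info` function, which renders a product page in headless Chrome and collects its image and GIF URLs. It also returns lists for product titles and descriptions, but the current implementation does not populate them yet.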

In [None]:
def extraer_info(url):
    """Render *url* in headless Chrome and collect its image URLs.

    The page is loaded with Selenium, parsed with BeautifulSoup, and every
    ``<img>`` source is resolved against *url*. URLs containing a blacklisted
    keyword are dropped, GIFs are separated from the other images, and the
    remaining images are downloaded so that only those larger than
    100x100 pixels are kept. The results are also printed.

    Args:
        url: Address of the product page to load.

    Returns:
        A 5-tuple ``(titles, descriptions, todos_los_titulos,
        filtered_image_urls, gif_urls)``. ``todos_los_titulos`` is the same
        list object as ``titles``.

    NOTE(review): ``titles`` and ``descriptions`` are never populated, so
    they are always returned empty. The selectors to use are not evident
    from this file - confirm the intended extraction and add it here.
    """
    # Local import: only `urljoin` is imported at file level.
    from urllib.parse import urlparse

    options = Options()
    options.add_argument("--headless")  # Run in headless mode
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    service = Service()  # Create the service for the driver
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(url)
        time.sleep(5)  # Wait for the page and JS to load
        page_source = driver.page_source
    finally:
        # Always shut the browser down, even when loading the page fails;
        # otherwise a headless Chrome process is left running.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    titles = []
    descriptions = []
    image_urls = []
    gif_urls = []

    # Extended list of keywords to exclude irrelevant images.
    # NOTE(review): matching is by plain substring, so short entries such as
    # 'ico' also reject URLs that merely contain those letters - confirm
    # that this is acceptable.
    blacklist_keywords = [
        'logo', 'loggo', 'logotipo', 'favicon', 'icon', 'ico', 'brandmark',
        'marca', 'log_', '_logo', 'log-', '-logo', 'logggo', 'watermark',
        'paypal', 'sistecredito', 'visa', 'mastercard', 'payment', 'pago',
        'credit', 'debit', 'secure', 'checkout', 'cart'
    ]

    for img in soup.find_all('img'):
        src = img.get('src') or img.get('data-src') or img.get('data-image')
        if not src:
            continue

        full_url = urljoin(url, src)
        parsed = urlparse(full_url)

        # Inline placeholders (data: URIs and similar) cannot be downloaded.
        if parsed.scheme not in ('http', 'https'):
            continue

        # Exclude images containing keywords in the URL
        full_url_lower = full_url.lower()
        if any(keyword in full_url_lower for keyword in blacklist_keywords):
            continue

        # Classify on the URL path only, so a query string such as
        # "?v=123" does not hide the ".gif" extension.
        if parsed.path.lower().endswith('.gif'):
            gif_urls.append(full_url)
        else:
            image_urls.append(full_url)

    # Remove duplicates while keeping the order in which they appear on the
    # page (a plain set() would make the output order non-deterministic).
    image_urls = list(dict.fromkeys(image_urls))
    gif_urls = list(dict.fromkeys(gif_urls))

    # Filter images by size (requires requests and Pillow)
    filtered_image_urls = []
    for img_url in image_urls:
        try:
            response = requests.get(img_url, timeout=5)
            if response.status_code != 200:
                continue
            with Image.open(BytesIO(response.content)) as image:
                width, height = image.size
        except Exception:
            # Deliberately best-effort: an image that cannot be downloaded
            # or decoded is skipped rather than aborting the whole scrape.
            continue

        # Exclude small images (e.g., icons)
        if width > 100 and height > 100:  # Adjust minimum size as needed
            filtered_image_urls.append(img_url)

    # Add all titles as an additional value
    todos_los_titulos = titles

    print("Product Titles:", titles)
    print("Product Descriptions:", descriptions)
    print("Filtered Image URLs:", filtered_image_urls)
    print("GIF URLs:", gif_urls)

    # Return 5 values
    return titles, descriptions, todos_los_titulos, filtered_image_urls, gif_urls

# Extract and Filter Image URLs
Use the `extraer_info` function to extract and filter image URLs based on size and blacklist keywords.

In [None]:
# Product page to scrape.
url = "https://dyshopcol.com/products/parches-de-ojos-durazno-bioaqua"
# extraer_info returns five values; `imagenes` holds the size-filtered image
# URLs and `gifs` the GIF URLs.
titulo, descripcion, todos_los_titulos, imagenes, gifs = extraer_info(url)

# Display Filtered Images
Use matplotlib to display the filtered images retrieved from the URLs.

In [None]:
# Display images and URLs
for img_url in imagenes:
    print("URL:", img_url)
    try:
        # A timeout keeps a single stalled server from hanging the cell.
        response = requests.get(img_url, timeout=5)
        if response.status_code != 200:
            print(f"Error loading image from {img_url}")
            continue
        img = Image.open(BytesIO(response.content))
    except (requests.RequestException, OSError) as exc:
        # Network failures and undecodable image data skip this URL only,
        # so the remaining images are still shown.
        print(f"Error loading image from {img_url}: {exc}")
        continue

    plt.imshow(img)
    plt.axis('off')
    plt.show()

# Test with Example URLs
Test the `extraer_info` function with example URLs and print the results, including titles, descriptions, and image URLs.

In [None]:
# Example URL for testing
url = "https://stanley1913.co/hidratacion/3170-14508-termo-stanley-quick-flip-go.html#/3797-capacidad-36oz1l/4131-color-rose_quartz"
# extraer_info returns five values; unpacking into four names raises
# "ValueError: too many values to unpack".
titles, descriptions, todos_los_titulos, image_urls, gif_urls = extraer_info(url)