# Import Required Libraries
Import the necessary libraries, including SeleniumBase and any other required modules.

In [None]:
# Import Required Libraries
from dataclasses import dataclass
import pandas as pd
import random
from fp.fp import FreeProxy
import time
from seleniumbase import DriverContext
import requests
import logging

# Configure root logging at INFO level. The classes in this notebook log their
# progress with logger.debug(...), so lower the level to logging.DEBUG to see
# those messages.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the classes defined in this notebook.
logger = logging.getLogger(__name__)

# Define the Product Dataclass
Define the Product dataclass to store product information such as title, price, about_product, rating, and reviews.

In [None]:
# Define the Product Dataclass
@dataclass
class Product:
    title: str
    price: float | str
    about_product: str | list[str]
    rating: float | str
    reviews: list[dict[str, str | float]]
    
    def __iter__(self):
        yield 'title', self.title
        yield 'price', self.price
        yield 'about_product', self.about_product
        yield 'rating', self.rating
        yield 'reviews', self.reviews

    @classmethod
    def save_product_data(cls, products: list['Product'], filename: str = "amazon_products.csv"):
        data = {
            "product_name": [p.title for p in products],
            "price": [p.price for p in products],
            "about_product": [p.about_product for p in products],
            "reviews": [p.reviews for p in products],
            "rating": [p.rating for p in products]
        }
        df = pd.DataFrame(data)
        df.to_csv(filename, index=False)

# Implement Scraper Class Using SeleniumBase
Create a new Scraper class that uses SeleniumBase methods for web scraping instead of the current implementation.

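Note: `FreeProxy.get()` can fail with `FreeProxyException: There are no working proxies at this time.`, so the proxy-fetching code below has to handle that error.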

In [None]:
class ProxyManager:
    def __init__(self, 
                 timeout: float = 1.0,
                 https: bool = True,
                 country_codes: list[str] | None = None,
                 min_proxies: int = 3):
        self.country_codes = country_codes or ['US', 'GB']
        self.timeout = timeout
        self.https = https
        self.min_proxies = min_proxies
        self.proxies: list[str] = []
        self.failed_proxies: set = set()
        self.current_proxy = None
        self._refresh_proxies()

    def _validate_proxy(self, proxy: str) -> bool:
        """Test if proxy is working"""
        try:
            test_url = "http://www.google.com"
            proxies = {
                "http": f"http://{proxy}",
                "https": f"http://{proxy}"
            }
            response = requests.get(test_url, 
                                  proxies=proxies, 
                                  timeout=self.timeout)
            return response.status_code == 200
        except:
            return False

    def _refresh_proxies(self) -> None:
        """Fetch new proxies"""
        max_attempts = 3
        for attempt in range(max_attempts):
            try:
                new_proxies = []
                for country in self.country_codes:
                    proxy = FreeProxy(
                        country_id=[country],
                        timeout=self.timeout,
                        https=self.https,
                        rand=True
                    ).get()
                    if proxy and proxy not in self.failed_proxies:
                        if self._validate_proxy(proxy):
                            new_proxies.append(proxy)
                
                if len(new_proxies) >= self.min_proxies:
                    self.proxies = new_proxies
                    logger.debug(f"Refreshed proxy list: {len(self.proxies)} proxies")
                    return
                
                time.sleep(1)  # Delay between attempts
            except Exception as e:
                logger.error(f"Error refreshing proxies: {str(e)}")
                time.sleep(1)
        
        logger.error("Failed to refresh proxy list")

    def get_proxy(self) -> str | None:
        """Get next working proxy"""
        if len(self.proxies) < self.min_proxies:
            self._refresh_proxies()
            
        if not self.proxies:
            return None
            
        self.current_proxy = random.choice(self.proxies)
        logger.debug(f"Selected proxy: {self.current_proxy}")
        return self.current_proxy

    def remove_current_proxy(self) -> None:
        """Remove failed proxy and get new ones if needed"""
        if self.current_proxy:
            if self.current_proxy in self.proxies:
                self.proxies.remove(self.current_proxy)
            self.failed_proxies.add(self.current_proxy)
            logger.debug(f"Removed failed proxy: {self.current_proxy}")
            
        if len(self.proxies) < self.min_proxies:
            self._refresh_proxies()

In [None]:

class AliBabaScraper:
    """Context manager around a SeleniumBase driver with optional proxy rotation.

    Use it in a ``with`` statement: entering creates the browser driver
    (available as ``self.driver``), leaving closes it again.
    """

    def __init__(self, 
                 headless: bool = True,
                 load_images: bool = False,
                 window_size: tuple[int, int] = (700, 900),
                 max_retries: int = 3,
                 proxy_use: bool = False,
                 proxy_timeout: float = 1.0):
        """Initialize scraper with proxy rotation.

        Args:
            headless: Run the browser without a visible window.
            load_images: Load images; when False images are blocked.
            window_size: ``(width, height)``, passed to the driver as
                ``"width,height"``.
            max_retries: Maximum number of load attempts per URL when proxy
                rotation is enabled.
            proxy_use: Route traffic through proxies from a ProxyManager.
            proxy_timeout: Timeout handed to the ProxyManager.
        """
        self.headless = headless
        self.block_images = not load_images
        self.window_size = f"{window_size[0]},{window_size[1]}"
        self.max_retries = max_retries
        self.proxy_use = proxy_use
        # Always define the attribute so it can be inspected safely.
        self.proxy_manager = ProxyManager(timeout=proxy_timeout) if proxy_use else None
        logger.debug("Initializing Scraper - Headless: %s", headless)

    def __enter__(self):
        """Create the driver and return the scraper itself."""
        logger.debug("Creating new driver instance")
        self._create_driver()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the driver context, if any, and forget the driver references."""
        logger.debug("Cleaning up driver")
        context = getattr(self, 'driver_context', None)
        if context is None:
            return
        try:
            context.__exit__(exc_type, exc_val, exc_tb)
        finally:
            # Clear both references even if closing failed, and tolerate a
            # half-created state where 'driver' was never assigned.
            for name in ('driver_context', 'driver'):
                self.__dict__.pop(name, None)

    def _create_driver(self):
        """Create new driver instance with proxy.

        Returns:
            The scraper itself.
        """
        proxy = None
        if self.proxy_use:
            proxy = self.proxy_manager.get_proxy()
            if proxy is None:
                # Make the fallback to a direct connection visible.
                logger.warning("No working proxy available; continuing without a proxy")
            logger.debug("Using proxy: %s", proxy)

        # Explicitly pass headless mode
        self.driver_context = DriverContext(
            browser="chrome",  # Explicitly set browser
            headless=self.headless,  # Pass headless flag
            block_images=self.block_images,
            window_size=self.window_size,
            proxy=proxy
        )
        self.driver = self.driver_context.__enter__()

        # A visible browser is maximized, which overrides window_size.
        if not self.headless:
            self.driver.maximize_window()
            logger.debug("Browser window maximized")
        return self

    def get(self, url: str) -> bool:
        """Navigate to URL with proxy rotation and retries.

        A load counts as successful when it raises no exception and the page
        source does not contain the word "captcha" (case-insensitive). With
        proxy rotation enabled, each failure discards the current proxy and
        recreates the driver before the next attempt; without it, the first
        failure ends the call.

        Returns:
            True if the page loaded successfully, False otherwise.
        """
        logger.debug("Navigating to: %s", url)
        for attempt in range(self.max_retries):
            try:
                logger.debug("Attempt %d of %d", attempt + 1, self.max_retries)
                self.driver.get(url)
                if "captcha" not in self.driver.page_source.lower():
                    logger.debug("Page loaded successfully")
                    return True
                logger.debug("Captcha detected on page")
            except Exception as e:
                logger.error("Error loading page: %s", e)
            if not self.proxy_use:
                # Nothing to rotate, so retrying would repeat the same request.
                break
            logger.debug("Rotating proxy and retrying...")
            self.proxy_manager.remove_current_proxy()
            self.__exit__(None, None, None)
            self._create_driver()
            time.sleep(1)
        if self.proxy_use:
            logger.error("All proxy attempts failed")
        else:
            logger.error("Page load failed (proxy rotation disabled, no retry)")
        return False

In [None]:
from bs4 import BeautifulSoup

# Search term used by the scraping cell below.
search_query = "laptop"

def get_search_url(query: str) -> str:
    """Build the Alibaba search URL for ``query``.

    The query string is percent-encoded, so search terms containing spaces,
    ``&``, ``=`` or non-ASCII characters yield a valid URL.

    Args:
        query: Free-text search term.

    Returns:
        The full search URL including the query string.
    """
    from urllib.parse import urlencode

    base_url = "https://www.alibaba.com/trade/search"
    params = {
        "spm": "a2700.product_home_newuser.home_new_user_first_screen_fy23_pc_search_bar.searchButton",
        "tab": "all",
        "SearchText": query
    }
    return f"{base_url}?{urlencode(params)}"

# Load the search results page and print every product-detail link found on it.
with AliBabaScraper(headless=False, load_images=True) as scraper:
    if scraper.get(get_search_url(search_query)):
        # Successful page load without bot detection
        data = scraper.driver.page_source

        # Parse the page source with BeautifulSoup. This only happens on
        # success: after a failed load there is no page source to parse.
        soup = BeautifulSoup(data, 'html.parser')

        # Find all links on the page
        links = soup.find_all('a', href=True)

        # Filter and display only the links that contain 'alibaba.com/product-detail/'
        product_links = [link['href'] for link in links if 'alibaba.com/product-detail/' in link['href']]
        for product_link in product_links:
            print(product_link)
    else:
        print("All proxies failed")
        

//www.alibaba.com/product-detail/14-inch-Win-10-11-brand_1600673527864.html
//www.alibaba.com/product-detail/14-inch-Win-10-11-brand_1600673527864.html
//www.alibaba.com/product-detail/Cheap-14-Inch-6GB-RAM-OEM_1601202528113.html?s=p
//www.alibaba.com/product-detail/Cheap-14-Inch-6GB-RAM-OEM_1601202528113.html?s=p
//www.alibaba.com/product-detail/Cheap-14-Inch-6GB-RAM-OEM_1601202528113.html?s=p
//www.alibaba.com/product-detail/Cheap-14-Inch-6GB-RAM-OEM_1601202528113.html?s=p
//www.alibaba.com/product-detail/Cheap-14-Inch-6GB-RAM-OEM_1601202528113.html?s=p
//www.alibaba.com/product-detail/Cheap-14-Inch-6GB-RAM-OEM_1601202528113.html?s=p
//www.alibaba.com/product-detail/CENAVA-12-Inch-IP67-MSM8953-Android_1600554046774.html
//www.alibaba.com/product-detail/CENAVA-12-Inch-IP67-MSM8953-Android_1600554046774.html
//www.alibaba.com/product-detail/CENAVA-12-Inch-IP67-MSM8953-Android_1600554046774.html
//www.alibaba.com/product-detail/CENAVA-12-Inch-IP67-MSM8953-Android_1600554046774.html
//ww