# Web Scraper Syacker

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

# Function for a single page
def scrape_page(driver):
    """Extract the LEGO set records from the page currently loaded in *driver*.

    Every ``article.set`` element on the page is read for its id, name,
    piece count, minifig count and RRP.

    Args:
        driver: Object exposing ``find_elements`` (a Selenium WebDriver in
            this file) that is already positioned on a set listing page.

    Returns:
        A list of dicts, one per set, with the keys ``id``, ``SetName``,
        ``Pieces``, ``Minifigs`` and ``USRetailPrice``. All values are the
        whitespace-stripped text of the matching element.
    """
    # Output column -> XPath relative to one <article class="set"> element.
    # The order here is both the lookup order and the column order.
    field_xpaths = {
        "id": ".//div[@class='meta']//a[contains(@href, '/sets/')]",
        "SetName": ".//h1/a",
        "Pieces": ".//dt[normalize-space()='Pieces']/following-sibling::dd",
        "Minifigs": ".//dt[normalize-space()='Minifigs']/following-sibling::dd",
        "USRetailPrice": ".//dt[normalize-space()='RRP']/following-sibling::dd",
    }

    data = []
    for set_ in driver.find_elements(By.CSS_SELECTOR, "article.set"):
        try:
            record = {
                column: set_.find_element(By.XPATH, xpath).text.strip()
                for column, xpath in field_xpaths.items()
            }
        except NoSuchElementException:
            # NOTE(review): a set lacking ANY of the fields above (e.g. one
            # with no 'Minifigs' entry) is dropped entirely - confirm that
            # this is the intended behaviour.
            print("Some elements are missing on this page.")
            continue

        # The RRP cell may list several prices separated by '|'; keep only
        # the first one. Strip AFTER splitting so the separator's padding
        # does not end up as trailing whitespace in the value.
        record["USRetailPrice"] = record["USRetailPrice"].split("|")[0].strip()
        data.append(record)

    return data

# Main scraping function
def scrape_lego_sets(base_url, max_pages=12000):
    """Scrape consecutive listing pages and return every set as a DataFrame.

    Pages are requested as ``{base_url}/page-{n}`` starting at ``n = 1``
    using a headless Chrome session. Scraping stops when ``max_pages`` pages
    have been visited, when the current page contains no link whose text is
    the next page number, or when an exception is raised while scraping. In
    the last case the error is printed and the rows collected so far are
    still returned.

    Args:
        base_url: Listing URL without the ``/page-N`` suffix. A trailing
            slash is tolerated.
        max_pages: Upper bound on the number of pages to visit.

    Returns:
        A pandas DataFrame built from the dicts returned by ``scrape_page``,
        one row per scraped set (empty if nothing was scraped).
    """
    # Avoid producing '...//page-N' when the caller passes a trailing slash.
    base_url = base_url.rstrip("/")

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    # Let webdriver_manager resolve the chromedriver binary instead of
    # relying on a hard-coded placeholder path that does not exist.
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    all_data = []
    try:
        for current_page in range(1, max_pages + 1):
            print(f"Scraping page {current_page}...")
            driver.get(f"{base_url}/page-{current_page}")
            time.sleep(2)  # Wait for the page to load

            all_data.extend(scrape_page(driver))

            # A link labelled with the next page number means there is more
            # to fetch; only its existence matters, not the element itself.
            try:
                driver.find_element(By.LINK_TEXT, str(current_page + 1))
            except NoSuchElementException:
                print("No more pages found.")
                break
    except Exception as e:
        # Best-effort boundary: report the failure but keep partial results.
        print(f"An error occurred: {e}")
    finally:
        # Always release the browser process, even on failure.
        driver.quit()

    return pd.DataFrame(all_data)

# Brickset query whose result pages will be scraped
BASE_URL = "https://brickset.com/sets/query-9196"

# Collect the listed sets (visiting at most 12000 pages) into a DataFrame
lego_df = scrape_lego_sets(base_url=BASE_URL, max_pages=12000)

# Write the table to CSV, leaving out the DataFrame index column
lego_df.to_csv(path_or_buf="Brickset-list.csv", index=False)

