!pip install webdriver_manager
!pip install openpyxl
!pip install --upgrade selenium

In [254]:
# Import the libraries
import time
import pandas as pd

# Import the classes for interacting with the web page
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException,ElementClickInterceptedException
from selenium.webdriver.common.keys import Keys

from selenium.webdriver.chrome.options import Options
from collections import defaultdict



In [255]:
# Build the Chrome options object; every flag listed here is handed to the
# browser when the driver is started.
chrome_options = Options()
for chrome_flag in (
    '--disable-notifications',
    '--allow-running-insecure-content',
    '--disable-extensions',
    '--disable-software-rasterizer',
    '--disable-popup-blocking',
):
    chrome_options.add_argument(chrome_flag)

In [277]:
# Start the web driver: webdriver_manager supplies the chromedriver path,
# which is wrapped in a Service and combined with the options defined above.
chromedriver_path = ChromeDriverManager().install()
chrome_service = Service(chromedriver_path)
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

In [278]:

# Entry page of the scrape (used cars for sale, Germany)
base_link = "https://www.ooyyo.com/germany/used-cars-for-sale/c=CDA31D7114D3854F111BFE6FAA651453/"

driver.get(base_link)

# Accept the cookie policy.
# The banner is rendered after the page load, so wait for the button instead of
# looking it up immediately; if it never shows up (e.g. consent already given),
# carry on rather than aborting the whole notebook.
accept_button_xpath = "//button[@mode='primary' and @size='large']/span[text()='AGREE']"
try:
    accept_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, accept_button_xpath))
    )
    accept_button.click()
    time.sleep(2)
except TimeoutException:
    print("Cookie banner not found; continuing without accepting it.")

# maximize screen
driver.maximize_window()

In [279]:
# Zoom the page out to 85%, otherwise the elements are blocked by advertisements.
driver.execute_script("document.body.style.zoom = '85%'")
time.sleep(2)  # short pause after changing the zoom before the page is used

In [259]:
# Gathering information for each listed item from one page
def feature_collect(driver):
    """Collect the basic information of every regular offer on the current results page.

    Each offer card is clicked, which is expected to open the offer's detail
    page in a new window. The label/value pairs of the ``basic-info`` list and
    the "Contact seller" link are read there, then the detail window is closed
    and the driver is switched back to the results window.

    Offers that cannot be clicked, or whose click does not open a new window,
    are skipped so that the results window itself is never scraped or closed
    by mistake.

    Args:
        driver: Selenium WebDriver currently showing a results page.

    Returns:
        list[dict]: one dictionary per offer that was opened, mapping every
        ``basic-info`` label to its value, plus a ``"Contact"`` key holding the
        contact link (``"NA"`` when the detail page has no such link).

    Raises:
        NoSuchElementException: if an opened detail page has no ``basic-info``
            element. The detail window is closed before the error propagates.
    """
    offer_info = []
    max_attempts = 3
    # Remember the results window so we can always come back to it
    results_window = driver.current_window_handle
    # Exclude the cars on promotion, which are in the first and last position of the page
    offer_elements = driver.find_elements(By.XPATH, '//div[@class="resultset"]//a[@class="car-card-1"]')[1:-1]

    # Process each relevant item on the page
    for offer_element in offer_elements:
        # Dictionary for the basic info of this offer
        basic_info = {}
        # Scroll to the element using JavaScript
        driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", offer_element)

        # Snapshot of the open windows, used below to detect the detail window
        handles_before_click = driver.window_handles

        # Custom wait for the element to be clickable
        clicked = False
        for _ in range(max_attempts):
            try:
                time.sleep(1)
                # Try to click the element
                offer_element.click()
                clicked = True
                break  # Stop retrying once the click went through
            except ElementClickInterceptedException:
                # Something covers the element: re-scroll and retry
                driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'start'});", offer_element)
            except Exception as e:
                print(f"An error occurred while clicking element {offer_element}: {e}")

        if not clicked:
            print(f"Element {offer_element} not clickable after {max_attempts} attempts.")
            # No detail window was opened, so there is nothing to scrape or close
            continue

        # Make sure the click really opened a new window before switching;
        # otherwise we would scrape (and then close) the results window.
        try:
            WebDriverWait(driver, 5).until(EC.new_window_is_opened(handles_before_click))
        except TimeoutException:
            print(f"No detail window opened for element {offer_element}; skipping it.")
            continue

        detail_window = next(
            handle for handle in driver.window_handles if handle not in handles_before_click
        )
        # Switch to the new window
        driver.switch_to.window(detail_window)

        try:
            time.sleep(1)

            # Find the basic info
            ul_element = driver.find_element(By.CLASS_NAME, 'basic-info')
            driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'start'});", ul_element)
            # Iterate through each <li> within the <ul>
            for li_element in ul_element.find_elements(By.TAG_NAME, 'li'):
                # Each <li> is expected to hold a label <div> and a value <div>
                div_elements = li_element.find_elements(By.TAG_NAME, 'div')
                if len(div_elements) < 2:
                    # Not a label/value row; nothing to record
                    continue
                label = div_elements[0].text.strip()
                value = div_elements[1].text.strip()
                basic_info[label] = value

            # Also include the contact link
            try:
                contact_link = driver.find_element(By.XPATH, "//a[contains(@class, 'btn-contact') and contains(text(), 'Contact seller')]")
                driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'start'});", contact_link)
                # Get the value of the href attribute
                href_value = contact_link.get_attribute("href")
                print("Contact link:", href_value)
            except (NoSuchElementException, TimeoutException):
                # A missing contact link must not discard the rest of the record
                href_value = "NA"

            basic_info["Contact"] = href_value
            offer_info.append(basic_info)
        finally:
            # Always close the detail window and return to the results window,
            # even when scraping the detail page failed
            driver.close()
            driver.switch_to.window(results_window)
            time.sleep(2)

    return offer_info

In [260]:
def process_page(driver):
    """Scrape every results page reachable through the "Next" button.

    For each page the offers are collected with ``feature_collect``; then the
    page is scrolled to the bottom and the "Next" button is clicked. If the
    interstitial advertisement iframe is present after the click, its dismiss
    button is pressed. The loop ends on the first exception, which includes the
    timeout raised when no "Next" button is visible.

    Args:
        driver: Selenium WebDriver currently showing the first results page.

    Returns:
        list[list[dict]]: one entry per scraped page, each being the list
        returned by ``feature_collect`` for that page.
    """
    next_button_xpath = "//a[@class='btn btn-lg btn-block btn-warning' and contains(text(), 'Next')]"
    parent_iframe_id = "google_ads_iframe_4110858/OOYCRP_ooyyopremium/web_interstitial_0"
    # 2nd layer of iframe, nested inside the parent iframe
    iframe_id_2nd = "ad_iframe"

    country_results = []
    while True:
        try:
            offer_info = feature_collect(driver)
            time.sleep(1)
            # Keep the offers of this page
            country_results.append(offer_info)
            # Scroll down to the bottom of the page using JavaScript
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)

            next_button = WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.XPATH, next_button_xpath))
            )
            print("Next button found!")

            # Click the "Next" button through JavaScript
            driver.execute_script("arguments[0].click();", next_button)

            # Look the ad iframe up once and reuse the result
            parent_iframes = driver.find_elements(By.ID, parent_iframe_id)
            if parent_iframes:
                try:
                    # Enter the outer iframe, then the nested one
                    driver.switch_to.frame(parent_iframes[0])
                    driver.switch_to.frame(iframe_id_2nd)

                    # Find and click the "Close ad" button inside the iframe
                    dismiss_button = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.ID, "dismiss-button"))
                    )
                    dismiss_button.click()
                finally:
                    # Always leave the iframe context, even if dismissing failed,
                    # so later lookups run against the main document again
                    driver.switch_to.default_content()
            else:
                # Handle the case when the parent iframe is not present
                print("Parent iframe not found. Continuing with feature_collect.")

        except Exception as e:
            # Handle exceptions (e.g., when the "Next" button is not present)
            print(f"An error occurred: {e}")
            break  # Exit the loop if an error occurs or the "Next" button is not present

    return country_results


In [261]:
# Open the country drop-down and collect its entries.
# The drop-down is located through the classes "o-select" and "_js-qs-country".
country_dropdown_locator = (By.CLASS_NAME, "o-select._js-qs-country")
dropdown_wait = WebDriverWait(driver, 10)
o_select_div = dropdown_wait.until(EC.element_to_be_clickable(country_dropdown_locator))
# Bring the drop-down into view before clicking it
driver.execute_script("arguments[0].scrollIntoView(true);", o_select_div)
o_select_div.click()
time.sleep(2)
# Every <li> below the drop-down is one country selector
country_elements = o_select_div.find_elements(By.TAG_NAME, 'li')
country_elements

[<selenium.webdriver.remote.webelement.WebElement (session="7dc4cea3220152376d543eae3859eaa6", element="f.C7740EC408BA72D8E04740BD56F3C1D3.d.9C245A1CB045559274904745DE01E68C.e.365")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7dc4cea3220152376d543eae3859eaa6", element="f.C7740EC408BA72D8E04740BD56F3C1D3.d.9C245A1CB045559274904745DE01E68C.e.366")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7dc4cea3220152376d543eae3859eaa6", element="f.C7740EC408BA72D8E04740BD56F3C1D3.d.9C245A1CB045559274904745DE01E68C.e.367")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7dc4cea3220152376d543eae3859eaa6", element="f.C7740EC408BA72D8E04740BD56F3C1D3.d.9C245A1CB045559274904745DE01E68C.e.368")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7dc4cea3220152376d543eae3859eaa6", element="f.C7740EC408BA72D8E04740BD56F3C1D3.d.9C245A1CB045559274904745DE01E68C.e.369")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7dc4cea3220152376d54

In [262]:
# The country name is the text of the first <div> found inside each <li> of the drop-down
country_list = [
    country_element.find_element(By.TAG_NAME, 'div').text
    for country_element in country_elements
]

print(country_list)

['Austria', 'Belgium', 'Czech Republic', 'Denmark', 'France', 'Germany', 'Hungary', 'Italy', 'Netherlands', 'Norway', 'Poland', 'Portugal', 'Romania', 'Spain', 'Sweden', 'Switzerland', 'USA']


In [263]:

# Scrape every country of the drop-down and write one CSV file per country
for country_name in country_list:
    time.sleep(1)

    # Type the country name into the search box and confirm with "Enter"
    input_element = driver.find_element(By.CSS_SELECTOR, 'input.form-control')
    # Scroll to the input element
    driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'start'});", input_element)
    input_element.click()

    # Send keys to the input element
    input_element.send_keys(country_name)
    time.sleep(1)
    # Look the element up again before pressing Enter
    input_element = driver.find_element(By.CSS_SELECTOR, 'input.form-control')

    # Simulate pressing the "Enter" key
    input_element.send_keys(Keys.ENTER)

    time.sleep(2)
    # Check if we arrived at the webpage for that specific country
    if country_name in driver.page_source:
        print(f"The country name '{country_name}' is present in the page source.")
    else:
        print(f"The country name '{country_name}' is not present in the page source.")

    # Walk through all result pages of this country
    country_results = process_page(driver)

    # Flatten the per-page lists into a single list of offer dictionaries
    flat_list_of_dicts = [item for sublist in country_results for item in sublist]
    # Offers do not all share the same labels; pandas leaves the missing cells
    # as NaN (a defaultdict's factory is never consulted by the DataFrame
    # constructor), so fill them with "NA" explicitly.
    df = pd.DataFrame(flat_list_of_dicts).fillna('NA')
    # Save the df into a csv named after the country
    df.to_csv(f'{country_name}.csv', index=False)

The country name 'France' is present in the page source.
An error occurred: Message: no such element: Unable to locate element: {"method":"css selector","selector":".basic-info"}
  (Session info: chrome=122.0.6261.112); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00D28D33+51395]
	(No symbol) [0x00C95F91]
	(No symbol) [0x00B4E13A]
	(No symbol) [0x00B862BB]
	(No symbol) [0x00B863EB]
	(No symbol) [0x00BBC162]
	(No symbol) [0x00BA3ED4]
	(No symbol) [0x00BBA570]
	(No symbol) [0x00BA3C26]
	(No symbol) [0x00B7C629]
	(No symbol) [0x00B7D40D]
	GetHandleVerifier [0x010A64F3+3711107]
	GetHandleVerifier [0x010E58DA+3970154]
	GetHandleVerifier [0x010E0BC8+3950424]
	GetHandleVerifier [0x00DD9D39+776393]
	(No symbol) [0x00CA1764]
	(No symbol) [0x00C9C648]
	(No symbol) [0x00C9C7F9]
	(No symbol) [0x00C8DE20]
	BaseThreadInitThunk [0x77367BA9+25]
	RtlInitializeExceptionCh

In [281]:
# Shut down the browser and end the WebDriver session once scraping is done
driver.quit()