In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import pandas as pd
import time
import re

In [2]:

def setup_driver():
    """Build a Chrome WebDriver configured with this notebook's standard flags.

    Returns:
        webdriver.Chrome: A newly started driver whose options request
        headless mode, a disabled GPU, a 1920x1080 window and disabled
        notifications.
    """
    # Applied in order: no UI, no GPU, fixed viewport, no notification prompts.
    flags = (
        "--headless",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--disable-notifications",
    )

    opts = Options()
    for flag in flags:
        opts.add_argument(flag)

    return webdriver.Chrome(options=opts)


In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

def extract_texts_from_element(driver, locator, timeout=10):
    """
    Wait for an element and collect the unique, non-empty texts of its spans.

    Args:
        driver (webdriver): Selenium WebDriver instance
        locator (tuple): Selenium locator strategy (By, value)
        timeout (float): Seconds to wait for the element to be present.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        list: Stripped span texts in first-seen order without duplicates.
        An empty list is returned when the wait times out or any other
        error is raised while locating the element.

    Note:
        A later cell in this notebook defines another function with the same
        name that takes a WebElement instead of a locator. Once that cell has
        run, this definition is shadowed.
    """
    try:
        # Block until the target element exists in the DOM (or the wait expires).
        parent_div = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(locator)
        )
        print("Element found successfully!")

        spans = parent_div.find_elements(By.TAG_NAME, 'span')
        print(f"Number of spans found: {len(spans)}")

        texts = []
        seen = set()  # O(1) duplicate check; `texts` keeps first-seen order
        for span in spans:
            try:
                text = span.text.strip()
            except Exception as span_error:
                # Best effort: one unreadable span must not abort the rest.
                print(f"Error extracting text from a span: {span_error}")
                continue

            if text and text not in seen:
                seen.add(text)
                texts.append(text)
                print(f"Extracted text: {text}")

        return texts

    except TimeoutException:
        print("Timeout: Element not found within the specified time.")
        return []

    except NoSuchElementException:
        print("Element not found on the page.")
        return []

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return []

def main():
    """Demo driver: open a page and try several locators for the same element.

    Starts a plain (non-headless) Chrome session, loads the placeholder URL
    and calls ``extract_texts_from_element`` with a class-name, an XPath and
    a CSS locator in turn, stopping at the first one that yields any text.
    Every exception is reported with a print, and the browser is always
    closed.

    NOTE(review): ``extract_texts_from_element`` is looked up when this runs.
    A later cell redefines that name with a ``(driver, element)`` signature,
    so after that cell has executed the locator tuples passed here no longer
    match what the helper expects. The URL is still a placeholder too.
    """
    browser = webdriver.Chrome()

    try:
        browser.get("YOUR_URL_HERE")

        # Same target expressed three ways; order is the order of attempts.
        candidates = (
            (By.CLASS_NAME, 'W4Efsd'),
            (By.XPATH, '//div[contains(@class, "W4Efsd")]'),
            (By.CSS_SELECTOR, 'div.W4Efsd'),
        )

        found = False
        for candidate in candidates:
            print(f"\nTrying locator: {candidate}")
            extracted = extract_texts_from_element(browser, candidate)
            if not extracted:
                continue

            print("\nSuccessfully extracted texts:")
            print(extracted)
            found = True
            break

        if not found:
            print("Could not find the element using any of the provided strategies.")

    except Exception as err:
        print(f"An error occurred in main function: {err}")

    finally:
        # Always release the browser, even after a failure above.
        browser.quit()

In [4]:
def extract_texts_from_element(driver, element):
    """
    Collect the distinct, non-empty texts of every span under an element.

    Args:
        driver (webdriver): Selenium WebDriver instance (accepted but not
            used by this function)
        element (WebElement): Parent element whose descendant spans are read

    Returns:
        list: Stripped span texts in first-seen order, without duplicates
        or empty strings.

    Note:
        An earlier cell defines a function with this same name that takes a
        locator tuple; running this cell replaces that definition.
    """
    stripped = (
        node.text.strip()
        for node in element.find_elements(By.TAG_NAME, 'span')
    )
    # dict keys preserve insertion order, giving an ordered de-duplication.
    return list(dict.fromkeys(value for value in stripped if value))


In [38]:

    
async def scrape_sunway_pyramid_directory():
    """Scrape result-card titles from a Google Maps page into a DataFrame.

    Despite the function name and the log messages, the hard-coded URL is a
    Google Maps place page for "Othaim Mall"; the name is kept so existing
    callers keep working.

    Steps:
        1. Start a driver via ``setup_driver()`` and open the URL.
        2. Wait for a ``.lXJj5c`` element, then pause for content to render.
        3. Scroll the first ``.lXJj5c`` element into view up to two times to
           trigger loading of further results.
        4. Read the title text of every result card inside the container.

    The function is declared ``async`` but awaits nothing: all waiting is
    done with blocking ``time.sleep`` calls.

    Returns:
        pandas.DataFrame | None: One row per card (iterated in reverse DOM
        order) with the title text in a ``name`` column; the frame has no
        columns when no card was read. ``None`` when an exception aborts
        the scrape (the error is printed).
    """
    url = "https://www.google.com/maps/place/Othaim+Mall/@24.6856632,44.1387095,7z/data=!4m11!1m2!2m1!1sothaim+mall!3m7!1s0x3e2f06e87cc9db33:0x1ba11d2574bec9d3!8m2!3d24.6856632!4d46.7754282!10e3!15sCgtvdGhhaW0gbWFsbCIDiAEBWg0iC290aGFpbSBtYWxskgEPc2hvcHBpbmdfY2VudGVy4AEA!16s%2Fg%2F11c1qb1ptg?entry=ttu&g_ep=EgoyMDI1MDMwMi4wIKXMDSoJLDEwMjExNDUzSAFQAw%3D%3D"
    driver = setup_driver()

    try:
        print("Accessing the Sunway Pyramid directory...")
        driver.get(url)

        # Wait for the page to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".lXJj5c"))
        )
        time.sleep(10)  # Give the results panel time to render

        shop_data = []
        # Container the cards are looked up in.
        par = driver.find_element(By.XPATH, "//*[@jslog='107076;mutable:true;']")
        time.sleep(2)  # Allow some time for initial content to load

        # Scroll the first ".lXJj5c" element into view a bounded number of
        # times. (The previous `"" in button.text` filter was always true, so
        # this is what the loop effectively did.)
        # NOTE(review): ".lXJj5c" is presumably the list's loading/end marker
        # -- confirm against the live page.
        max_scroll_rounds = 2
        scroll_failed = False
        for _ in range(max_scroll_rounds):
            try:
                markers = driver.find_elements(By.CSS_SELECTOR, ".lXJj5c")
                if not markers:
                    break
                driver.execute_script(
                    "arguments[0].scrollIntoView({block: 'center'});", markers[0]
                )
                time.sleep(1)  # Allow time for scrolling
                time.sleep(2)  # Wait for new items to load
            except Exception as e:
                print(f"Error clicking 'Load More': {e}")
                scroll_failed = True
                break
        if not scroll_failed:
            print("No more 'Load More' button. All items loaded.")

        # The leading "." keeps the XPath relative to `par`; a bare "//"
        # searches the whole document even when called on an element.
        # NOTE(review): results are now limited to descendants of `par` --
        # confirm that container really wraps the result cards.
        cards = par.find_elements(
            By.XPATH,
            ".//*[contains(@class, 'UaQhfb') and contains(@class, 'fontBodyMedium')]",
        )

        for card in reversed(cards):
            try:
                # Relative XPath so each card yields its own title instead of
                # the first title in the document.
                title = card.find_element(
                    By.XPATH,
                    ".//div[contains(@class, 'qBF1Pd') and contains(@class, 'fontHeadlineSmall')]",
                )
                # Store the text, not the WebElement: elements cannot be read
                # any more once the driver has quit.
                shop_data.append({"name": title.text.strip()})
            except Exception as e:
                print(f"Error processing card: {e}")

        # Convert to DataFrame and save as CSV
        df = pd.DataFrame(shop_data)
        # df.to_csv("sunway_pyramid_shops.csv", index=False)
        print(f"Successfully scraped {len(shop_data)} shops from Sunway Pyramid directory.")

        return df

    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    finally:
        driver.quit()


In [None]:

# Run the scraper and keep its result for the following cells.
# The top-level `await` relies on the notebook (IPython) kernel's support for
# awaiting in a cell; it is not valid in a plain Python script.
# `shops_df` is None when the scrape hit an exception (the error is printed).
shops_df = await scrape_sunway_pyramid_directory()

Accessing the Sunway Pyramid directory...


In [None]:
# shops_df