# Mini Project 1: Scraping Data from a Dynamic Webpage
- Install necessary Python libraries: selenium, beautifulsoup4 (bs4), and chromedriver-autoinstaller.
- Choose a dynamic webpage for scraping. For this project, we will scrape dynamic product data from a demo e-commerce site such as [InMotion Hosting](https://www.inmotionhosting.com/).


## Task

- Initialize Selenium WebDriver
- Load the Web Page
- Identify the elements that contain hosting plan details.
- Extract necessary data such as plan names, features, and pricing.
- Store and Save the Data
- Close Selenium WebDriver


In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import pandas as pd

In [11]:
# # Setup Selenium WebDriver
# options = webdriver.ChromeOptions()
# options.add_argument("--start-maximized")
# driver = webdriver.Chrome(options=options)

# # InMotion Hosting page
# url = "https://www.inmotionhosting.com/"
# driver.get(url)


# for i in range(5):  # Limit to 5 buttons
#     try:
#         # Wait for the container with buttons to load
#         wait = WebDriverWait(driver, 30)
#         container = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.imh-rostrum-container")))
        
#         # Find all buttons within the container
#         buttons = container.find_elements(By.CSS_SELECTOR, "a.cta-link.btn-secondary-alt")
        
#         # Click the button
#         driver.execute_script("arguments[0].click();", buttons[i])
#         print(f"\nClicked button {i+1}") # Visual confirmation
        
#         # Wait for the page to load after clicking
#         WebDriverWait(driver, 10).until(EC.staleness_of(container))
        
#         # Process nested content
#         nested_soups = process_nested_content(driver)
#         print(f"Captured {len(nested_soups)} term soups for main button {i+1}")
        
#         # Get the page source and create a BeautifulSoup object
#         soup = BeautifulSoup(driver.page_source, 'html.parser')
#         soups.append(soup)
        
#         print(f"Captured soup for button {i+1}") # Visual confirmation
#         print('=' * 50)
        
#         # Navigate back to the initial page
#         driver.back()
        
#     except Exception as e:
#         print(f"An error occurred while processing button {i+1}: {str(e)}")

# driver.quit()

The attempt above (now commented out) clicks the first button twice, then jumps to the third one, where the page structure is entirely different.

In [12]:
# Collect the unique plan-page links exposed by the home-page buttons.
# `links` keeps first-seen order; `seen_links` gives O(1) duplicate checks.
# Both are defined before any browser work so later cells can always rely on
# `links` existing, even if navigation fails.
url = "https://www.inmotionhosting.com/"
links = []  # unique links, in order of appearance
seen_links = set()  # links already recorded

# setup selenium webdriver
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=options)

# The outer try/finally guarantees the browser is closed even when
# `driver.get` raises or the cell is interrupted; the original only reached
# `driver.quit()` on the happy path.
try:
    driver.get(url)

    try:
        # wait for the container with buttons to load
        wait = WebDriverWait(driver, 30)
        container = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.imh-rostrum-container")))

        # find all buttons within the container
        buttons = container.find_elements(By.CSS_SELECTOR, "a.cta-link.btn-secondary-alt")

        # extract links from all buttons while maintaining order
        for button in buttons:
            link = button.get_attribute("href")
            # skip missing/empty hrefs and anything already collected
            if link and link not in seen_links:
                links.append(link)
                seen_links.add(link)

    except Exception as e:
        # best-effort: report the problem and keep whatever was collected so far
        print(f"an error occurred: {str(e)}")
finally:
    driver.quit()

# print all collected unique links in order of appearance
print("collected unique links:")
for link in links:
    print(link)

collected unique links:
https://www.inmotionhosting.com/shared-hosting
https://www.inmotionhosting.com/vps-hosting
https://www.inmotionhosting.com/dedicated-servers
https://www.inmotionhosting.com/wordpress-hosting
https://www.inmotionhosting.com/wordpress-hosting/managed-wordpress


Now we browse to each of the addresses collected from the plan buttons.

Some of the nested pages require accepting the cookie banner first:

In [13]:
def accept_cookies(driver, selector="button.cookie-accept-button", timeout=10):
    """
    click the cookie consent button on the current page if it becomes clickable.

    this is a best-effort helper: any failure (including the button never
    becoming clickable within ``timeout``) is printed and swallowed, so the
    caller can continue scraping either way.

    args:
        driver: the selenium webdriver whose current page is inspected.
        selector: css selector of the consent button; the default is a
            placeholder and will likely need adjusting per site.
        timeout: maximum number of seconds to wait for the button.

    returns:
        None
    """
    try:
        # wait for the cookie consent button to be clickable
        wait = WebDriverWait(driver, timeout)
        cookie_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))

        # click the button to accept cookies
        cookie_button.click()
        print("accepted cookies")

    except Exception as e:
        # deliberately broad: a missing banner must not abort the scrape
        print(f"error accepting cookies: {str(e)}")

Scrape the first link: open the link and click all the buttons found

In [14]:
# Scrape the first collected link: open it, click every term-selector button,
# and snapshot the plan container after each click.

# get the first url from the links list *before* launching a browser, so an
# empty `links` fails fast without leaving a Chrome window behind
url = links[0]

# list of {'button': <button label>, 'soup': <BeautifulSoup>} per clicked button
soups = []

# setup selenium webdriver for the first url
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=options)

# try/finally guarantees the browser is closed even if a wait below times out
try:
    driver.get(url)  # navigate to the current url

    # wait for the specific section to be present
    wait = WebDriverWait(driver, 10)
    section = wait.until(EC.presence_of_element_located((By.ID, "shared-hosting-rostrum")))

    # define the title variable by finding the h2 element
    container = section.find_element(By.CSS_SELECTOR, "div.container")
    plan_title = container.find_element(By.TAG_NAME, "h2").text  # reused later as the csv file name

    # find all term selector buttons within this specific section
    term_buttons = section.find_elements(By.CSS_SELECTOR, "button.imh-term-selector")

    # click each button, print its name, and create a soup
    for button in term_buttons:
        # fallback label so the error handler never has to touch the element
        # again (re-reading `button.text` there could raise a second time)
        label = "<unknown>"
        try:
            # scroll the button into view
            driver.execute_script("arguments[0].scrollIntoView(true);", button)

            # wait until the button is clickable
            wait.until(EC.element_to_be_clickable(button))

            # read the label once and reuse it for printing and storage
            label = button.text

            # click through JavaScript rather than a native click
            driver.execute_script("arguments[0].click();", button)

            print(label)  # print the name of the button

            # wait for the content to load after clicking
            # NOTE(review): this container is presumably already in the DOM
            # before the click, so this wait may return immediately - confirm
            # that the pricing has actually refreshed before the snapshot.
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.imh-rostrum-container")))

            # create a soup from the specified division after clicking
            container_html = driver.find_element(By.CSS_SELECTOR, "div.imh-rostrum-container").get_attribute('outerHTML')
            soup = BeautifulSoup(container_html, 'html.parser')

            # append a dictionary with button text and soup to the list
            soups.append({'button': label, 'soup': soup})

        except Exception as e:
            # best-effort per button: report and move on to the next term
            print(f"Error clicking button {label}: {str(e)}")
finally:
    driver.quit()  # close the driver after processing

3 Year
1 Year
1 Month


After extracting all the soups into a list, we load them into a DataFrame:

In [15]:
# Flatten every captured soup into four parallel column lists
# (one entry per plan card), then assemble and export a DataFrame.
all_titles = []
all_sub_titles = []
all_pricing = []
all_details = []


def _stripped_text(node, fallback=''):
    """Return the whitespace-stripped text of *node*, or *fallback* when the lookup found nothing."""
    if not node:
        return fallback
    return node.text.strip()


for entry in soups:
    term_label = entry['button']  # billing-term button that produced this snapshot
    term_soup = entry['soup']

    # each plan is rendered as one imh-rostrum-card element
    for plan_card in term_soup.find_all(class_='imh-rostrum-card'):
        # plan name (h3), prefixed with the term label
        heading = _stripped_text(plan_card.find('h3'), 'No Title')
        all_titles.append(f'{term_label} {heading}')

        # short description under the plan name
        all_sub_titles.append(
            _stripped_text(plan_card.find(class_='imh-rostrum-sub-title'))
        )

        # raw text of the pricing container
        all_pricing.append(
            _stripped_text(plan_card.find(class_='imh-pricing-container'))
        )

        # feature bullets; an empty list when the card has no details list
        bullet_list = plan_card.find(class_='imh-rostrum-details-list')
        bullet_items = bullet_list.find_all('li') if bullet_list else []
        all_details.append([item.text.strip() for item in bullet_items])

# assemble the dataframe from the collected columns
columns = {
    'Plan': all_titles,
    'Sub-Title': all_sub_titles,
    'Pricing': all_pricing,
    'Details': all_details,
}
df = pd.DataFrame(columns)

# show the result
print(df)

# save it into a csv named after the plan title
csv_name = f'{plan_title}.csv'
df.to_csv(csv_name, index=False, encoding='utf-8')

              Plan                                          Sub-Title  \
0      3 Year Core  Suitable for simple sites, with everything you...   
1    3 Year Launch  Designed for seamless multi-site management & ...   
2     3 Year Power  Powerful resources perfect for larger sites & ...   
3       3 Year Pro  Ideal for high-traffic or eCommerce sites with...   
4      1 Year Core  Suitable for simple sites, with everything you...   
5    1 Year Launch  Designed for seamless multi-site management & ...   
6     1 Year Power  Powerful resources perfect for larger sites & ...   
7       1 Year Pro  Ideal for high-traffic or eCommerce sites with...   
8     1 Month Core  Suitable for simple sites, with everything you...   
9   1 Month Launch  Designed for seamless multi-site management & ...   
10   1 Month Power  Powerful resources perfect for larger sites & ...   
11     1 Month Pro  Ideal for high-traffic or eCommerce sites with...   

                                              Pric

Then rinse and repeat: each nested page behind the buttons has a different structure, so we need to write new code for each link in the list `links`.

Also, it's 3:00 AM, a new personal record!!!! I'm going to sleep.