 Web Scraping with Selenium: From Static to Dynamic Pages


1. Introduction to Web Scraping
Web scraping is the process of extracting data from websites automatically.
We use it when:

   - APIs are not available or limited

   - Data needs to be collected from multiple sources

   - Real-time data extraction is required

Types of Pages:

A. Static Pages: Content is fixed and doesn't change after loading

B. Dynamic Pages: Content changes based on user interactions (JavaScript)

**2. Selenium Setup and Installation**
First, let's install the required packages:

In [None]:
pip install selenium beautifulsoup4 pandas webdriver-manager

We'll use Wikipedia as our example website.

3. Basic Selenium Concepts
WebDriver Setup

In [None]:
import time
from urllib.parse import quote, unquote

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Setup WebDriver with automatic management
def setup_driver():
    """Create and return a Chrome WebDriver configured for headless use.

    The chromedriver executable path comes from
    ``ChromeDriverManager().install()``, so no manual driver setup is needed.

    Returns:
        The ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    chrome_options = webdriver.ChromeOptions()
    # Same three switches, applied in the same order.
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)

    driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path), options=chrome_options)

# Create the shared module-level driver; the scraping functions below use it
# as a global rather than taking it as a parameter.
driver = setup_driver()

Key Selenium Methods
driver.get(url) - Navigate to URL

driver.find_element(By.*, "selector") - Find single element

driver.find_elements(By.*, "selector") - Find multiple elements

element.click() - Click on element

element.send_keys("text") - Send text to input field

driver.back(), driver.forward() - Navigation

4. Static Page Scraping - Wikipedia Example
Static pages are easier to scrape as content loads immediately.

In [None]:
def scrape_wikipedia_static():
    """Scrape the featured-article and news boxes of the Wikipedia main page.

    Uses the module-level ``driver``.

    Returns:
        dict | None: ``{'featured_article': {'title', 'link'},
        'news_items': [{'text', 'link'}, ...]}``. A news item that contains
        no hyperlink gets ``'link': None``. ``None`` is returned (after
        printing the error) if the page could not be scraped.
    """
    try:
        driver.get("https://en.wikipedia.org/wiki/Main_Page")

        # The wait returns the located element, so no second lookup is needed.
        featured_article = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "mp-tfa"))
        )

        # NOTE: this is the text of the first <p> inside the box; it is kept
        # under the 'title' key for backward compatibility.
        article_title = featured_article.find_element(By.TAG_NAME, "p").text
        article_link = featured_article.find_element(By.TAG_NAME, "a").get_attribute("href")

        print("Featured Article:", article_title)
        print("Link:", article_link)

        news_section = driver.find_element(By.ID, "mp-itn")

        news_data = []
        for item in news_section.find_elements(By.TAG_NAME, "li"):
            # find_elements returns [] instead of raising, so one list item
            # without a link no longer discards the whole result.
            anchors = item.find_elements(By.TAG_NAME, "a")
            news_data.append({
                'text': item.text,
                'link': anchors[0].get_attribute("href") if anchors else None,
            })

        return {
            'featured_article': {'title': article_title, 'link': article_link},
            'news_items': news_data
        }

    except Exception as e:
        print(f"Error scraping static content: {e}")
        return None

# Run the static scraper once and show whatever it returned.
static_data = scrape_wikipedia_static()
print("Static Scraping Results:", static_data, sep="\n")

5. Dynamic Page Scraping - Wikipedia Search
Dynamic content requires waiting for elements to load or for user interactions to complete.

In [None]:
def scrape_wikipedia_dynamic(search_term="Data science"):
    """Search Wikipedia through the on-page form and scrape the page it opens.

    Uses the module-level ``driver``.

    Args:
        search_term: Text typed into the search box.

    Returns:
        dict | None: Keys ``search_term``, ``page_title``, ``content_preview``
        (first paragraph with visible text, or ``""``), ``toc`` (list of
        entry texts, empty when the page has no ``#toc`` element) and
        ``references_sample`` (up to five reference texts). ``None`` is
        returned (after printing the error) on failure.
    """
    try:
        driver.get("https://en.wikipedia.org/wiki/Main_Page")
        start_url = driver.current_url
        wait = WebDriverWait(driver, 10)

        search_box = wait.until(
            EC.element_to_be_clickable((By.ID, "searchInput"))
        )
        search_box.clear()
        search_box.send_keys(search_term)
        driver.find_element(By.ID, "searchButton").click()

        # Waiting for the heading alone could be satisfied by the page being
        # left if it also carries a #firstHeading, so wait for the navigation
        # itself first.
        wait.until(EC.url_changes(start_url))
        heading = wait.until(
            EC.presence_of_element_located((By.ID, "firstHeading"))
        )
        page_title = heading.text

        content_paragraphs = driver.find_elements(
            By.CSS_SELECTOR, "#mw-content-text .mw-parser-output > p"
        )
        # Skip leading paragraphs that render as empty text.
        content_preview = next(
            (text for text in (p.text for p in content_paragraphs) if text.strip()),
            "",
        )

        # The table of contents is optional: find_elements yields [] when the
        # page has none, instead of aborting the whole scrape.
        toc_data = [
            item.text for item in driver.find_elements(By.CSS_SELECTOR, "#toc li")
        ]

        references = driver.find_elements(By.CSS_SELECTOR, ".references li")
        ref_data = [ref.text for ref in references[:5]]

        return {
            'search_term': search_term,
            'page_title': page_title,
            'content_preview': content_preview,
            'toc': toc_data,
            'references_sample': ref_data
        }

    except Exception as e:
        print(f"Error scraping dynamic content: {e}")
        return None

# Run the search-driven scraper for a sample term and display its result.
dynamic_data = scrape_wikipedia_dynamic("Machine learning")
print("Dynamic Scraping Results:", dynamic_data, sep="\n")

6. Best Practices and Ethics

   


1. Respect robots.txt

In [None]:
def check_robots_txt(url):
    """Download and print a site's robots.txt.

    This only displays the file; deciding whether a given path may be scraped
    is left to the reader.

    Args:
        url: Site root, e.g. ``"https://en.wikipedia.org"``. A trailing slash
            is tolerated.
    """
    import requests

    robots_url = f"{url.rstrip('/')}/robots.txt"
    try:
        # A timeout keeps the call from hanging on an unresponsive host.
        response = requests.get(robots_url, timeout=10)
        # Treat 4xx/5xx responses as failures instead of printing an error page.
        response.raise_for_status()
    except requests.RequestException:
        print("Could not fetch robots.txt")
    else:
        print("Robots.txt content:")
        print(response.text)

# Print Wikipedia's robots.txt so its rules can be reviewed before scraping.
check_robots_txt("https://en.wikipedia.org")

2. Rate Limiting and Delays

In [None]:
import random
import time

def respectful_delay(min_seconds=1, max_seconds=3):
    """Sleep for a random duration so consecutive requests are spaced out.

    Args:
        min_seconds: Shortest pause in seconds (default 1).
        max_seconds: Longest pause in seconds (default 3).

    Raises:
        ValueError: If ``min_seconds`` is negative or ``max_seconds`` is
            smaller than ``min_seconds``.
    """
    if min_seconds < 0 or max_seconds < min_seconds:
        raise ValueError(
            f"invalid delay range: min_seconds={min_seconds}, max_seconds={max_seconds}"
        )
    time.sleep(random.uniform(min_seconds, max_seconds))

def scrape_with_respect(url):
    """Open ``url`` in the shared driver after a polite random pause.

    Template function: it loads the page and returns nothing.
    """
    # Throttle first so back-to-back calls never hit the site without a gap.
    respectful_delay()
    driver.get(url)
    # Extraction logic for the loaded page goes here.

3. Error Handling

In [None]:
def robust_scraping():
    """Template showing layered exception handling around scraping code.

    The two expected Selenium failures each print their own message, any
    other exception is reported generically, and the module-level ``driver``
    is quit in every case.
    """
    try:
        # Placeholder: replace with the real scraping steps.
        pass
    except (NoSuchElementException, TimeoutException) as known_error:
        if isinstance(known_error, NoSuchElementException):
            print("Element not found")
        else:
            print("Page loading timeout")
    except Exception as unexpected:
        print(f"Unexpected error: {unexpected}")
    finally:
        # Runs on success and on failure alike, ending the browser session.
        driver.quit()

7. Our Complete Project

   
Let's create a complete Wikipedia scraper that handles both static and dynamic content:

In [None]:
class WikipediaScraper:
    """Scrape English-Wikipedia articles with a dedicated Selenium driver.

    Each instance creates its own driver through ``setup_driver()``. Release
    it with :meth:`close`, or use the instance as a context manager so the
    driver is quit even when an exception is raised::

        with WikipediaScraper() as scraper:
            data = scraper.scrape_article("Web scraping")
    """

    BASE_URL = "https://en.wikipedia.org"
    # Maximum number of characters kept in 'content_preview'.
    PREVIEW_CHARS = 500
    # Characters left unescaped in the article path. Anything else (notably
    # '?', '#', '%' and non-ASCII) is percent-encoded so it cannot be read as
    # URL syntax.
    _PATH_SAFE = "/:@!$&'()*+,;="
    _SEARCH_RESULT_LINK = ".mw-search-results li a"

    def __init__(self):
        self.driver = setup_driver()
        self.wait = WebDriverWait(self.driver, 10)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never suppress the caller's exception.
        return False

    @classmethod
    def _article_url(cls, article_title):
        """Return the article URL for a human-readable (unencoded) title."""
        slug = article_title.replace(' ', '_')
        return f"{cls.BASE_URL}/wiki/{quote(slug, safe=cls._PATH_SAFE)}"

    @classmethod
    def _preview(cls, text):
        """Return ``text`` cut to PREVIEW_CHARS, with '...' only if it was cut."""
        if len(text) <= cls.PREVIEW_CHARS:
            return text
        return text[:cls.PREVIEW_CHARS] + "..."

    def scrape_article(self, article_title):
        """Scrape one article identified by its title.

        Args:
            article_title: Human-readable title, e.g. ``"Data science"``.
                Spaces become underscores and special characters are
                percent-encoded.

        Returns:
            dict | None: Keys ``title``, ``url``, ``content_preview``,
            ``infobox``, ``categories`` and ``external_links``. ``None`` is
            returned (after printing the error) on failure.
        """
        try:
            url = self._article_url(article_title)
        except (AttributeError, TypeError) as e:
            # A non-string title is reported like any other scraping failure.
            print(f"Error scraping article: {e}")
            return None
        return self._scrape_url(url)

    def _scrape_url(self, url):
        """Load ``url`` and collect the article data; ``None`` on failure."""
        try:
            self.driver.get(url)

            # The wait returns the located heading element.
            heading = self.wait.until(
                EC.presence_of_element_located((By.ID, "firstHeading"))
            )
            body_text = self.driver.find_element(By.ID, "mw-content-text").text

            return {
                'title': heading.text,
                'url': url,
                'content_preview': self._preview(body_text),
                'infobox': self._extract_infobox(),
                'categories': self._extract_categories(),
                'external_links': self._extract_external_links()
            }

        except Exception as e:
            print(f"Error scraping article: {e}")
            return None

    def _extract_infobox(self):
        """Return ``{header text: cell text}`` for infobox rows; ``{}`` if none."""
        try:
            boxes = self.driver.find_elements(By.CLASS_NAME, "infobox")
            if not boxes:
                return {}

            data = {}
            for row in boxes[0].find_elements(By.TAG_NAME, "tr"):
                headers = row.find_elements(By.TAG_NAME, "th")
                cells = row.find_elements(By.TAG_NAME, "td")
                # Only rows with both a header and a value cell are kept.
                if headers and cells:
                    data[headers[0].text] = cells[0].text
            return data
        except Exception as e:
            # Best effort: a broken infobox must not lose the whole article,
            # but the failure is reported rather than silently hidden.
            print(f"Warning: could not extract infobox: {e}")
            return {}

    def _extract_categories(self):
        """Return the texts of the article's category links (best effort)."""
        try:
            cat_links = self.driver.find_elements(
                By.CSS_SELECTOR, "#mw-normal-catlinks ul li a"
            )
            return [link.text for link in cat_links]
        except Exception as e:
            print(f"Warning: could not extract categories: {e}")
            return []

    def _extract_external_links(self):
        """Return the hrefs of up to five ``.external.text`` links (best effort)."""
        try:
            ext_links = self.driver.find_elements(
                By.CSS_SELECTOR, ".external.text")
            return [link.get_attribute("href") for link in ext_links[:5]]
        except Exception as e:
            print(f"Warning: could not extract external links: {e}")
            return []

    @staticmethod
    def _search_target(drv, start_url):
        """Wait condition: URL of the article a search leads to, else False.

        Prefers the first link of the results list. If no list is shown but
        the browser has moved to a non-special ``/wiki/`` page, that page is
        taken as the result.
        NOTE(review): this relies on MediaWiki sending an exact title match
        straight to the article instead of listing results; confirm against
        the live site.
        """
        links = drv.find_elements(By.CSS_SELECTOR, WikipediaScraper._SEARCH_RESULT_LINK)
        if links:
            return links[0].get_attribute("href")
        current = drv.current_url
        if (current != start_url and "/wiki/" in current
                and "Special:" not in unquote(current)):
            return current
        return False

    def search_and_scrape(self, search_term):
        """Search Wikipedia for ``search_term`` and scrape the first hit.

        Returns:
            dict | None: Same structure as :meth:`scrape_article`. ``None`` is
            returned (after printing the error) if the search fails or
            yields nothing within the wait timeout.
        """
        try:
            self.driver.get(f"{self.BASE_URL}/wiki/Main_Page")
            start_url = self.driver.current_url
            search_box = self.wait.until(
                EC.element_to_be_clickable((By.ID, "searchInput"))
            )
            search_box.clear()
            search_box.send_keys(search_term)
            search_box.submit()

            article_url = self.wait.until(
                lambda drv: self._search_target(drv, start_url)
            )
        except Exception as e:
            print(f"Error in search and scrape: {e}")
            return None

        # Scrape by the link's own URL: rebuilding it from the last path
        # segment would break titles that contain '/'.
        return self._scrape_url(article_url)

    def close(self):
        """Quit the driver and end its browser session."""
        self.driver.quit()

# Usage example
def main():
    """Demo: scrape several articles and save the batch to a CSV file.

    Runs a direct scrape, a search-based scrape, then a loop over a list of
    topics whose successful results are written to ``wikipedia_data.csv``.
    The browser is closed even if one of the steps raises.
    """
    scraper = WikipediaScraper()
    try:
        # Example 1: Direct article scraping
        print("Scraping 'Data Science' article:")
        data_science_data = scraper.scrape_article("Data science")
        print(data_science_data)

        # Example 2: Search and scrape
        print("\nSearching for 'Artificial Intelligence':")
        ai_data = scraper.search_and_scrape("Artificial Intelligence")
        print(ai_data)

        # Example 3: Multiple articles
        topics = ["Machine Learning", "Python (programming language)", "Web scraping"]
        results = []

        for topic in topics:
            print(f"\nScraping: {topic}")
            result = scraper.scrape_article(topic)
            # Failed scrapes return None and are left out of the CSV.
            if result:
                results.append(result)
            respectful_delay()

        # Save to CSV
        df = pd.DataFrame(results)
        df.to_csv("wikipedia_data.csv", index=False)
        print("\nData saved to wikipedia_data.csv")
    finally:
        # Always release the browser so a failure above cannot leak it.
        scraper.close()

# Run the demo only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

Key Takeaways:

Static vs Dynamic: Understand when content loads immediately vs when it requires interaction

Waiting Strategies: Use explicit waits for dynamic content loading

Error Handling: Always implement robust error handling

Ethical Scraping: Respect robots.txt, add delays, and don't overload servers

Data Organization: Structure your scraped data for easy analysis