In [1]:
!pip install requests beautifulsoup4



In [7]:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import random
from lxml import html


In [8]:
def create_robust_session():
    """Build a ``requests.Session`` with transport-level retries and browser-like headers.

    The session mounts one ``HTTPAdapter`` (shared by ``http://`` and
    ``https://``) whose ``Retry`` policy re-sends a request up to five times
    when the server answers with a transient status code. A User-Agent is
    picked at random, once, when the session is created.

    Returns:
        requests.Session: the configured session. The caller owns it and
        should close it (or use it as a context manager) when done.
    """
    session = requests.Session()

    # Retry only statuses that can plausibly succeed on a later attempt.
    # 404/403 are deliberately NOT listed: re-sending the identical request
    # will not change the answer, and callers that retry on raise_for_status()
    # would multiply those useless requests.
    retries = Retry(
        total=5,  # maximum number of retries per request
        backoff_factor=0.5,  # exponential backoff between successive retries
        status_forcelist=[429, 500, 502, 503, 504],
    )

    # Mount the same retrying adapter for both schemes.
    adapter = HTTPAdapter(max_retries=retries)
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    # Pool of desktop browser User-Agent strings to choose from.
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.59'
    ]

    # 'Accept-Encoding' is intentionally left at the library default.
    # Requests only decodes Brotli bodies when a brotli package is installed,
    # so advertising 'br' unconditionally can produce content that neither
    # lxml nor BeautifulSoup can parse.
    session.headers.update({
        'User-Agent': random.choice(user_agents),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0'
    })

    return session

In [9]:
# Function to safely make requests
def safe_get(url, session=None, max_retries=3, timeout=30):
    """GET ``url`` politely, retrying on request failures.

    Each attempt sleeps a random 1-3 seconds first, then issues the request
    and raises on 4xx/5xx statuses. After a failed attempt the function waits
    a random 5-10 seconds before trying again.

    Args:
        url: address to fetch.
        session: object exposing ``get(url, timeout=...)``; when ``None`` a
            session is created with ``create_robust_session()`` and closed
            again before this function returns or raises.
        max_retries: maximum number of attempts.
        timeout: seconds passed to ``session.get`` so a stalled server cannot
            block forever; also accepts whatever ``requests`` accepts for
            ``timeout`` (e.g. a ``(connect, read)`` tuple).

    Returns:
        The response of the first successful attempt, or ``None`` when
        ``max_retries`` is less than 1 (no attempt is made).

    Raises:
        requests.exceptions.RequestException: re-raised from the final
            attempt when every attempt failed.
    """
    owns_session = session is None
    if owns_session:
        session = create_robust_session()

    try:
        for attempt in range(max_retries):
            try:
                # Random delay before every request to avoid hammering the site.
                time.sleep(random.uniform(1, 3))

                response = session.get(url, timeout=timeout)
                response.raise_for_status()  # turn 4xx/5xx into an exception

                print(f"Successfully retrieved page on attempt {attempt + 1}")
                return response

            except requests.exceptions.RequestException as e:
                # ConnectionError is a RequestException subclass; only the log
                # label differs, the retry handling is the same.
                if isinstance(e, requests.exceptions.ConnectionError):
                    label = "Connection error"
                else:
                    label = "Request error"
                print(f"{label} on attempt {attempt + 1}: {e}")
                if attempt == max_retries - 1:
                    raise
                time.sleep(random.uniform(5, 10))  # longer pause before retrying
    finally:
        # Only close sessions created here; a caller-supplied session stays open.
        if owns_session:
            session.close()

    return None


In [6]:
# Function to extract list items
def _collect_hrefs(anchors, parser_label, sink):
    """Append each non-empty ``href`` of ``anchors`` to ``sink``, logging it."""
    for anchor in anchors:
        href = anchor.get("href")
        if href:
            sink.append(href)
            print(f"Found href with {parser_label}: {href}")


def extract_list_items(response):
    """Extract link hrefs from the page twice: once with lxml, once with BeautifulSoup.

    The two passes run independently so that a failure in one parser does not
    prevent the other from producing results; errors are printed, not raised.

    NOTE(review): the BeautifulSoup pass takes the first element matching
    ``div:nth-child(1) > ul`` anywhere in the document, which is looser than
    the absolute XPath used for lxml, so the two lists may legitimately differ.

    Args:
        response: object whose ``content`` attribute holds the raw HTML bytes.

    Returns:
        dict: ``{'lxml': [...], 'bs4': [...]}`` with the hrefs found by each
        parser (empty lists when nothing matched or the pass failed).
    """
    results = {
        'lxml': [],
        'bs4': []
    }

    # Pass 1: lxml with an absolute XPath to the <a> inside each <li>'s <h2>.
    try:
        tree = html.fromstring(response.content)
        list_items = tree.xpath('/html/body/div[1]/div/div[2]/main/div/div/div[2]/div[1]/ul/li//h2/a')
        print(f"Found {len(list_items)} items using lxml")
        _collect_hrefs(list_items, "lxml", results['lxml'])
    except Exception as e:
        print(f"Error in lxml extraction: {e}")

    # Pass 2: BeautifulSoup with CSS selectors, isolated from pass 1.
    try:
        soup = BeautifulSoup(response.content, 'lxml')
        ul_element = soup.select_one('div:nth-child(1) > ul')

        if ul_element is not None:
            bs4_items = ul_element.select('li h2 a')
            print(f"Found {len(bs4_items)} items using BeautifulSoup")
            _collect_hrefs(bs4_items, "BeautifulSoup", results['bs4'])
    except Exception as e:
        print(f"Error in BeautifulSoup extraction: {e}")

    return results


In [None]:
# Main execution: fetch the listing page, extract the links, and tabulate them
def main(url="https://www.edmunds.com/volkswagen/beetle/2012/", save_csv=False):
    """Fetch ``url``, extract listing links with both parsers, and print them.

    Args:
        url: page to scrape.
        save_csv: when true, also write the two result tables to
            ``vw_beetle_urls_lxml.csv`` and ``vw_beetle_urls_bs4.csv`` in the
            current working directory.

    Any exception is caught and reported with a printed message, so this
    function does not raise.
    """
    try:
        # The context manager closes the session's pooled connections even if
        # the request fails. The response body is already downloaded by then.
        with create_robust_session() as session:
            response = safe_get(url, session)

        if response is not None and response.status_code == 200:
            results = extract_list_items(response)

            # Numbered listing per parser (heading, underline, results key).
            for heading, underline, key in (
                ("\nResults from lxml:", "------------------", 'lxml'),
                ("\nResults from BeautifulSoup:", "-------------------------", 'bs4'),
            ):
                print(heading)
                print(underline)
                for idx, href in enumerate(results[key], 1):
                    print(f"{idx}. URL: {href}")

            # Tabulate each parser's results.
            df_lxml = pd.DataFrame(results['lxml'], columns=['URL'])
            df_bs4 = pd.DataFrame(results['bs4'], columns=['URL'])

            print(df_lxml)
            print(df_bs4)

            if save_csv:
                df_lxml.to_csv('vw_beetle_urls_lxml.csv', index=False)
                df_bs4.to_csv('vw_beetle_urls_bs4.csv', index=False)
                print("\nData saved to CSV files")

        else:
            print("Failed to retrieve the page")

    except Exception as e:
        print(f"An error occurred: {str(e)}")


In [None]:
# Entry point: call main() only when this code runs as the top-level module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()