In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from random import randint
from datetime import datetime

In [5]:
class MercedesA150Scraper:
    def __init__(self):
        self.base_url = "https://www.autoscout24.be/nl/"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
    
    def create_search_url(self, page=1):
        """Create search URL specifically for Mercedes A 150"""
        url = (f"{self.base_url}lst/mercedes-benz/a-klasse/"
               f"?sort=standard&desc=0&cy=B&atype=C&ustate=N%2CU&powertype=kw"
               f"&search_id=&filters=model-a_150&page={page}")
        return url
    
    def get_page_content(self, url):
        """Fetch page content with error handling and rate limiting"""
        try:
            time.sleep(randint(2, 5))
            response = requests.get(url, headers=self.headers)
            response.raise_for_status()
            
            # Print status code and URL for debugging
            print(f"Status Code: {response.status_code}")
            print(f"URL: {url}")
            
            return response.text
        except requests.RequestException as e:
            print(f"Error fetching page: {e}")
            return None
    
    def extract_price(self, listing):
        """Extract price with multiple selector attempts and debugging"""
        try:
            # Try multiple possible price selectors
            price_selectors = [
                'div[data-price]',  # Try data attribute
                'span[data-price]',
                'div.Price_price__XZDdY',
                'span.Price_price__XZDdY',
                '.PriceAndSeals_price__li8qU',  # New possible class
                '.PriceContainer_price__iQHNs',  # Another possible class
                '[class*="price"]'  # Generic price class search
            ]
            
            for selector in price_selectors:
                price_element = listing.select_one(selector)
                if price_element:
                    # Try different attributes
                    price = (price_element.get('data-price') or 
                            price_element.text.strip())
                    if price:
                        # Print found price for debugging
                        print(f"Found price: {price}")
                        return price
            
            # If no price found with selectors, try finding € symbol
            all_text = listing.get_text()
            €_index = all_text.find('€')
            if €_index != -1:
                # Extract price near € symbol
                price_text = all_text[€_index:€_index+20]  # Take 20 chars after €
                price_text = ''.join(c for c in price_text if c.isdigit() or c in '€,.')
                if price_text:
                    return price_text
            
            return 'N/A'
            
        except Exception as e:
            print(f"Error extracting price: {e}")
            return 'N/A'
    
    def parse_listing(self, listing):
        """Extract detailed information from a single A 150 listing"""
        try:
            # Debug print
            print("\nParsing new listing...")
            
            # Basic information
            title = listing.find('h2', class_='ListItem_title__znV2I')
            title = title.text.strip() if title else 'N/A'
            print(f"Found title: {title}")
            
            # Extract price using the new method
            price = self.extract_price(listing)
            print(f"Extracted price: {price}")
            
            # Initialize details dictionary
            details_dict = {
                'title': title,
                'price': price,
                'mileage': 'N/A',
                'year': 'N/A',
                'transmission': 'N/A',
                'fuel_type': 'N/A',
                'power_kw': 'N/A',
                'seller_location': 'N/A',
                'listing_url': 'N/A',
                'scrape_date': datetime.now().strftime("%Y-%m-%d")
            }
            
            # Extract URL
            url_element = listing.find('a', class_='ListItem_title__znV2I')
            if url_element and 'href' in url_element.attrs:
                details_dict['listing_url'] = 'https://www.autoscout24.be' + url_element['href']
            
            # Extract details from the specification table
            details = listing.find_all('span', class_='VehicleDetailTable_item__koEV4')
            
            for detail in details:
                detail_text = detail.text.strip()
                
                if 'km' in detail_text.lower():
                    details_dict['mileage'] = detail_text
                elif len(detail_text) == 4 and detail_text.isdigit():
                    details_dict['year'] = detail_text
                elif 'kw' in detail_text.lower() or 'pk' in detail_text.lower():
                    details_dict['power_kw'] = detail_text
                elif any(fuel in detail_text.lower() for fuel in ['benzine', 'diesel', 'hybride']):
                    details_dict['fuel_type'] = detail_text
                elif any(trans in detail_text.lower() for trans in ['automatisch', 'handgeschakeld']):
                    details_dict['transmission'] = detail_text
            
            # Extract seller location
            location = listing.find('span', class_='ListItem_location__K_a5t')
            if location:
                details_dict['seller_location'] = location.text.strip()
            
            return details_dict
            
        except Exception as e:
            print(f"Error parsing listing: {e}")
            return None
    
    def clean_price(self, price):
        """Clean price string to numeric value"""
        try:
            if price == 'N/A':
                return None
            # Remove € symbol and any thousand separators
            price = str(price).replace('€', '').replace('.', '').replace(',', '').strip()
            # Extract only digits
            price = ''.join(filter(str.isdigit, price))
            return float(price) if price else None
        except Exception as e:
            print(f"Error cleaning price {price}: {e}")
            return None
    
    def scrape_listings(self, max_pages=5):
        """Scrape multiple pages of A 150 listings"""
        all_listings = []
        total_listings = 0
        
        for page in range(1, max_pages + 1):
            print(f"\nScraping page {page}...")
            url = self.create_search_url(page)
            content = self.get_page_content(url)
            
            if not content:
                continue
            
            soup = BeautifulSoup(content, 'html.parser')
            listings = soup.find_all('article', class_='ListItem_article__qgyo_')
            
            print(f"Found {len(listings)} listings on page {page}")
            
            if not listings:
                print(f"No more listings found on page {page}")
                break
            
            for listing in listings:
                listing_data = self.parse_listing(listing)
                if listing_data:
                    if 'a 150' in listing_data['title'].lower():
                        all_listings.append(listing_data)
                        total_listings += 1
            
            print(f"Scraped page {page}, found {len(listings)} listings, {total_listings} total A 150s")
        
        # Convert to DataFrame
        df = pd.DataFrame(all_listings)
        
        # Clean price column
        df['price'] = df['price'].apply(self.clean_price)
        
        # Clean mileage column
        df['mileage'] = df['mileage'].str.replace('km', '').str.replace('.', '').str.strip()
        df['mileage'] = pd.to_numeric(df['mileage'], errors='coerce')
        
        return df

# Example usage
if __name__ == "__main__":
    scraper = MercedesA150Scraper()
    results = scraper.scrape_listings()
    
    # Save results to CSV with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f'mercedes_a150_listings_{timestamp}.csv'
    results.to_csv(filename, index=False)
    
    # Print some basic statistics
    print("\nScraping Summary:")
    print(f"Total listings found: {len(results)}")
    if len(results) > 0:
        print(f"\nPrice Statistics:")
        print(results['price'].describe())
        print(f"\nMileage Statistics:")
        print(results['mileage'].describe())

SyntaxError: invalid character '€' (U+20AC) (2673700002.py, line 58)