In [78]:
import requests
import re
import time
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.edge.service import Service as EdgeService
from webdriver_manager.microsoft import EdgeChromiumDriverManager

In [79]:
def setup_driver(url):
    """Start an Edge WebDriver session and open ``url``.

    Args:
        url: Address of the page to load once the browser is running.

    Returns:
        The live ``webdriver.Edge`` instance. The caller owns it and is
        responsible for calling ``quit()`` when done.

    Raises:
        Whatever Selenium / webdriver_manager raise while installing the
        driver, starting the browser, or loading the page. If the failure
        happens after the browser has started, the browser is closed before
        the exception propagates.
    """
    # The browser launched below is Edge, so the options object must be
    # EdgeOptions and must actually be handed to the driver; otherwise the
    # "start-maximized" argument is never applied.
    options = webdriver.EdgeOptions()
    options.add_argument("start-maximized")
    driver = webdriver.Edge(
        service=EdgeService(EdgeChromiumDriverManager().install()),
        options=options,
    )
    try:
        driver.maximize_window()
        driver.get(url)
    except Exception:
        # Do not leave an orphaned browser process behind if setup fails.
        driver.quit()
        raise
    return driver

In [80]:
def _parse_listing(item):
    """Extract the fields of one listing from its ``<li>`` tag.

    Args:
        item: A BeautifulSoup tag for a single listing.

    Returns:
        A tuple ``(name, price, year, kilometer, fuel, transmission, engine,
        location)`` of strings. Fields that cannot be found are replaced by a
        ``'No ...'`` placeholder, except ``price``, which is an empty string
        when no digits are available.
    """
    # The name (tag text) and the descriptive title (``title`` attribute)
    # both come from the same div, so look it up once.
    title_div = item.find('div', {'data-aut-id': 'itemTitle'})
    if title_div:
        product_name = title_div.text.strip()
        # .get() so a div without a ``title`` attribute yields the placeholder
        # instead of raising KeyError and aborting the whole scrape.
        title_text = title_div.get('title', 'No title')
    else:
        product_name = 'No name'
        title_text = 'No title'

    fuel_match = re.search(r'\b(Bensin|Diesel|Electric)\b', title_text)
    fuel = fuel_match.group(1) if fuel_match else 'No fuel'

    transmission_match = re.search(r'\b(MT|AT|Manual|Automatic)\b', title_text)
    transmission = transmission_match.group(1) if transmission_match else 'No transmission'

    # Prefer a number that is explicitly an engine size (has a unit), then a
    # decimal number such as "1.5"; only then fall back to the first number in
    # the title. Taking the first number unconditionally picks up model
    # numbers that precede the displacement.
    engine_match = (
        re.search(r'(\d+(?:\.\d+)?)\s*(?:litre|liter|cc|L)\b', title_text, re.IGNORECASE)
        or re.search(r'\b(\d+\.\d+)\b', title_text)
        or re.search(r'(\d+(\.\d+)?)', title_text)
    )
    engine = engine_match.group(1) if engine_match else 'No engine size'

    subtitle_div = item.find('div', class_='_21gnE')
    # Same KeyError guard as for the title attribute above.
    subtitle_text = subtitle_div.get('title', 'No subtitle') if subtitle_div else 'No subtitle'

    year_match = re.search(r'(\d{4})', subtitle_text)
    year = year_match.group(1) if year_match else 'No year'

    kilometer_match = re.search(r'(\d+(\.\d+)?)\s?km', subtitle_text)
    kilometer = kilometer_match.group(1) if kilometer_match else 'No kilometer'

    price_span = item.find('span', class_='_1zgtX')
    price_text = price_span.text if price_span else 'No price'
    # Keep digits only; a missing price therefore becomes an empty string.
    price = re.sub(r'[^\d]', '', price_text)

    details_div = item.find('div', {'data-aut-id': 'itemDetails'})
    # Only take the first word when there is real text; splitting the
    # placeholder itself would truncate it to "No".
    location = details_div.text.strip().split(' ')[0] if details_div else ''
    if not location:
        location = 'No location'

    return (product_name, price, year, kilometer, fuel, transmission, engine, location)


def scrape_data(driver, max_clicks=500):
    """Expand the listing page and scrape every listing on it.

    Repeatedly clicks the button matched by ``div._38O09 > button``
    (presumably the "load more" button - confirm against the page) until it
    is no longer present or ``max_clicks`` is reached, then parses the
    resulting page source.

    Args:
        driver: Selenium WebDriver already showing the listing page.
        max_clicks: Upper bound on the number of button clicks.

    Returns:
        A list of tuples ``(name, price, year, kilometer, fuel, transmission,
        engine, location)``, one per ``<li class="_3V_Ww">`` element.
    """
    for _ in range(max_clicks):
        # Fixed pauses give the page time to render before and after a click.
        time.sleep(3)
        try:
            driver.find_element(By.CSS_SELECTOR, "div._38O09 > button").click()
            time.sleep(2)
        except NoSuchElementException:
            # Button is gone: nothing more to load.
            break
    time.sleep(5)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return [_parse_listing(item) for item in soup.find_all('li', class_='_3V_Ww')]

In [81]:
def save_to_csv(products, filename='Raw_data.csv'):
    """Write the scraped listings to ``filename`` as CSV.

    Args:
        products: Iterable of 8-field records, in the column order listed
            below.
        filename: Destination path of the CSV file.

    The file gets a header row and no index column; a confirmation message
    is printed once it has been written.
    """
    column_names = [
        'Name',
        'Price',
        'Year',
        'Kilometer',
        'Fuel',
        'Transmission',
        'Engine',
        'Location',
    ]
    table = pd.DataFrame(data=products, columns=column_names)
    table.to_csv(path_or_buf=filename, index=False)
    print('File has been created')

In [82]:
def main():
    """Scrape the listing page at the hard-coded URL and save the result as CSV.

    Opens the browser, collects the listings, always closes the browser
    afterwards, and writes the collected rows with ``save_to_csv``.
    """
    url = 'https://www.olx.co.id/jakarta-dki_g2000007/mobil-bekas_c198'
    driver = setup_driver(url)
    try:
        products = scrape_data(driver)
    finally:
        # Release the browser and driver process even if scraping fails.
        driver.quit()
    save_to_csv(products)

In [83]:
# Entry point: run the full scrape-and-save pipeline only when this module's
# __name__ is "__main__", not when it is imported.
if __name__ == "__main__":
    main()

File has been created
