Importing Packages For Scraping

In [1]:
import re
import time
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

Configuration for Facebook Marketplace (Modify As Needed)

In [2]:
# Modify City As Needed [Code below is for Istanbul, Turkey] - 
city = 'istanbul'

# Specific Cars - 
car_search = True # Set to True if you want to search for specific cars, False if you have no car in mind.

# Base URL -
base_url = "https://www.facebook.com/marketplace/" + city

# Configurations - 
base_url += "/search?" if car_search else "/vehicles"

# Vehicles - 
vehicles = ['Fiat Egea']

Query Format Function

In [3]:
def search_format(car: bool, params: dict, vehicle_name: str = '') -> str:
    if car:
        vehicle_name = vehicle_name.lower().replace(' ', '%20')
        search_url = f"{base_url}query={vehicle_name}"
    else:
        search_url = f"{base_url}"
    # Adding Parameters -
    if params['minPrice'] > 0:
        search_url += f"&minPrice={params['minPrice']}"
    if params['maxPrice'] > 0:
        search_url += f"&maxPrice={params['maxPrice']}"
    if params['exactPrice']:
        search_url += f"&exactPrice={params['exactPrice']}"
    if params['sortBy']:
        search_url += f"&sortBy={params['sortBy']}"
    return search_url

Sorting and Search Configuration

In [4]:
# Filter Params - 
sorting_options = [
    'best_match', 
    'price_ascend', 
    'price_descend', 
    'creation_time_descend', 
    'creation_time_ascend', 
    'distance_descend', 
    'distance_ascend', 
    'vehicle_mileage_descend', 
    'vehicle_mileage_ascend', 
    'vehicle_year_descend', 
    'vehicle_year_ascent'
]

# Search Params (Modify As Required) - 
search_params = {
    'minPrice': 100000,
    'maxPrice': 2000000,
    'exactPrice': '&exact=false',
    'sortBy': sorting_options[2],
}

Constructing URL(s) For Search **(if running this, do not run the cell below)**

In [5]:
urls = []

for vehicle in vehicles:
    print(vehicle)
    urls.append(search_format(car_search, search_params, vehicle))

print(urls)

Fiat Egea
['https://www.facebook.com/marketplace/istanbul/search?query=fiat%20egea&minPrice=100000&maxPrice=2000000&exactPrice=&exact=false&sortBy=price_descend']


Constructing URL for Browsing **(if running this, do not run the cell above)**

In [6]:
urls = []

for option in sorting_options:
    search_params.update({'sortBy': option})
    urls.append(search_format(car_search, search_params))

Browser Configuration

In [11]:
from webdriver_config import DRIVER_PATH

# Incognito - 
chrome_options = Options()
chrome_options.add_argument("--incognito")

# Headless - if you do not want the browser to open, set to True. Otherwise, False.
headless = False
chrome_options.add_argument("headless") if headless else None

# Modify as needed -
scroll_delay = 3
scrolls = 1

# Adding Selenium Service to Browser To Close Pop-Up
service = Service(DRIVER_PATH)
driver = webdriver.Chrome(service=service, options=chrome_options)

Function for Visiting and Scraping URL

In [12]:
def scraping(url: str, scroll_delay: int, scrolls: int, headless: bool) -> str:
    driver.get(url)

    # Sleep for 1 second for pop-up to display, and to close popup.
    if not headless:
        time.sleep(2)
        close_popup = driver.find_element(By.CSS_SELECTOR, "div[aria-label='Close']")
        close_popup.click()

    for _ in range(scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_delay)

    # Get Page Source - 
    return driver.page_source

Function for Parsing HTML -

In [14]:
features_classes = {
    'listing': 'x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24',
    'price': 'x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x676frb x1lkfr7t x1lbecb7 x1s688f xzsf02u',
    'model': 'x1lliihq x6ikm8r x10wlt62 x1n2onr6',
    'city': 'x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84',
    'odometer': 'x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84',
    'highlighted_details': 'x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x6prxxf xvq8zen xo1l8bm xzsf02u',
    'details': 'x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u'
}

def parse(page_source: str) -> list:
    page_soup = soup(page_source, 'html.parser')
    listings = page_soup.find_all('div', {'class': features_classes.get('listing')})
    data = []
    for listing in listings:
        listing_data = {
            'listing_html': listing,
            'highlighted_details': '',
            'details': ''
        }
        data.append(listing_data)

    return data

Functions for Clicking and Extracting Details For Each Listing - 

In [15]:
def extract_details(listing: str) -> str:
    highlighted_details_item = listing.find_all('span', {'class': features_classes.get('highlighted_details')})
    details_items = listing.find_all('span', {'class': features_classes.get('details')})

    highlighted_details = ' '.join([item.text.strip() for item in highlighted_details_item]) if highlighted_details_item else np.nan
    details = ' '.join([item.text.strip() for item in details_items]) if details_items else np.nan

    return highlighted_details, details

def click_to_extract(listing) -> str:
    # Click on Listing to Access Details -
    listing.click()
    time.sleep(4)

    # Get Page Source -
    page_source = driver.page_source
    listing_soup = soup(page_source, 'html.parser')
    driver.back()
    time.sleep(1)

    # Extract Details - 
    return extract_details(listing_soup)

Visiting URLs and Parsing

In [16]:
# Initializing dictionary with empty lists for each url.
listings_dict = {url: [] for url in urls}

for url in urls:
    page_source = scraping(url, scroll_delay, scrolls, headless)
    listings = parse(page_source)
    listings_dict[url] = listings
    print('Listings Found: ', len(listings))
    
    # Extracting Details By Clicking - 
    elements = driver.find_elements(By.CLASS_NAME, 'x1uepa24') # A snippet from listing class.
    for i in range(len(elements)):
        try:
            highlighted_details, details = click_to_extract(elements[i])
            # Append details as a dictionary to the listings_dict
            listings_dict[url][i]['highlighted_details'] = highlighted_details if highlighted_details else np.nan
            listings_dict[url][i]['details'] = details if details else np.nan
        except Exception as e:
            print(f"An error occurred while processing element {i}: {e}")

Listings Found:  147
An error occurred while processing element 2: Message: element click intercepted: Element <div class="x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24" style="max-width: 281.94px; min-width: 179.08px;">...</div> is not clickable at point (512, 532). Other element would receive the click: <div class="xxqbpr x1gja9t x17p1517 x8vdgqj x2b8uid">...</div>
  (Session info: chrome=127.0.6533.89)
Stacktrace:
	GetHandleVerifier [0x00007FF61F2C9632+30946]
	(No symbol) [0x00007FF61F27E3C9]
	(No symbol) [0x00007FF61F176FDA]
	(No symbol) [0x00007FF61F1CFEEE]
	(No symbol) [0x00007FF61F1CD962]
	(No symbol) [0x00007FF61F1CAE7B]
	(No symbol) [0x00007FF61F1CA095]
	(No symbol) [0x00007FF61F1BC271]
	(No symbol) [0x00007FF61F1ECA6A]
	(No symbol) [0x00007FF61F1BBBB6]
	(No symbol) [0x00007FF61F1ECC80]
	(No symbol) [0x00007FF61F20B041]
	(No symbol) [0x00007FF61F1EC813]
	(No symbol) [0x00007FF61F1BA6E5]
	(No symbol) [0x00007FF61F1B

Printing Raw Listings Data

In [44]:
# Prints the whole dict, including the raw HTML element stored for every
# listing, so the output of this cell is very large.
print(listings_dict)

{'https://www.facebook.com/marketplace/istanbul/search?query=fiat%20egea&minPrice=100000&maxPrice=2000000&exactPrice=&exact=false&sortBy=price_descend': [{'listing_html': <div class="x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24" style="max-width: 281.94px; min-width: 179.08px;"><div class="xjp7ctv"><div><span class="x1lliihq x1iyjqo2"><div><div class="x78zum5 xdt5ytf" data-virtualized="false"><div class="x9f619 x1n2onr6 x1ja2u2z"><div class="x3ct3a4"><a class="x1i10hfl xjbqb8w x1ejq31n xd10rxx x1sy0etr x17r0tee x972fbf xcfux6l x1qhh985 xm0m39n x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g x1sur9pj xkrqix3 x1lku1pv" href="/marketplace/item/1137735264188494/?ref=search&amp;referral_code=null&amp;referral_story_type=post&amp;__tn__=!%3AD" role="link" tabindex="0"><div class="x78zum5 xdt5ytf x1n2onr6"><div class="x1n2onr6"><div clas

Extracting Features From Raw Data

In [45]:
def extract_features(listings: dict) -> pd.DataFrame:
    data = []
    for url, listings in listings.items():
        for listing in listings:
            price_item = listing['listing_html'].find('span', {'class': features_classes.get('price')})
            model_item = listing['listing_html'].find('span', {'class': features_classes.get('model')})
            city_item = listing['listing_html'].find('span', {'class': features_classes.get('city')})

            price = price_item.text.strip() if price_item else np.nan
            model = model_item.text.strip() if model_item else np.nan
            city = city_item.text.strip() if city_item else np.nan
            data.append([url, price, model, city, listing['highlighted_details'], listing['details']])
    return pd.DataFrame(data, columns=['URL', 'Price', 'Model', 'City', 'Highlighted Details', 'Details'])

df = extract_features(listings_dict)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 2)

print(df)


                           URL                             Price                           Model                                    City             Highlighted Details                      Details                      
0    https://www.facebook.com/marketplace/istanbul/...  TRY1,300,000  2020..1.5.dizel 93.250 km otomatik titanyum pa...             Istanbul, Turkey        Used - Good   to 2020..1.5.dizel 93.250 km otomatik titanyum...
1    https://www.facebook.com/marketplace/istanbul/...  TRY1,245,000                         Stop Far Tampon Honda BOZ🎲             Istanbul, Turkey    Used - like new   to 2020..1.5.dizel 93.250 km otomatik titanyum...
2    https://www.facebook.com/marketplace/istanbul/...  TRY1,200,080  2023 MODEL EGEA 1.6 DİZEL OTOMATİK URBAN PLUS ...     Derince, Kocaeli, Turkey                NaN   to 2020..1.5.dizel 93.250 km otomatik titanyum...
3    https://www.facebook.com/marketplace/istanbul/...  TRY1,200,000  2023 MODEL EGEA 1.6 DİZEL OTOMATİK URBAN PLUS ... 

Removing Incomplete Data - 