# 01 - BaT Scraping

Scrape completed Porsche 911 auctions from Bring a Trailer.

**Outputs:**
- `data/raw/bat_listings.parquet` - Raw scraped data

In [1]:
# Delete bad data if needed. The path is relative to this notebook's directory
# and must match OUTPUT_PATH defined below (../data/raw/bat_listings.parquet).
# !rm ../data/raw/bat_listings.parquet

In [2]:
import logging
import time
from pathlib import Path

import pandas as pd

from price_analysis.scraping import (
    DataQualityError,
    create_driver,
    fetch_auctions,
    save_debug_page,
    validate_scraped_data,
)
from price_analysis.scraping.bat import listings_to_dataframe

# Notebook-level logger; the root logger is configured at INFO so progress
# messages emitted while scraping are shown in the cell output.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [3]:
# Filesystem layout, relative to the notebook's working directory
DATA_DIR = Path("../data")
RAW_DIR = DATA_DIR.joinpath("raw")
OUTPUT_PATH = RAW_DIR.joinpath("bat_listings.parquet")

# Make sure the raw-data directory exists before anything is written to it
RAW_DIR.mkdir(parents=True, exist_ok=True)

## Configure Scraping

Adjust search queries and pagination as needed.

In [4]:
# Scraper knobs
HEADLESS = True  # Flip to False to watch the browser while debugging
DELAY = 1.0  # Pause between requests, in seconds (be polite!)
MAX_CLICKS = 25  # How many times to press "Show More" (each click loads ~24 listings)

# One search per generation: a single BaT search seems to cap how many results
# it returns, so several narrower queries surface more listings overall.
QUERIES = [
    # "Porsche 996",  # 1999-2004 (996.1 and 996.2)
    # "Porsche 997",  # 2005-2012 (997.1 and 997.2)
    "Porsche 991",  # 2012-2019 (991.1 and 991.2)
    "Porsche 992",  # 2020+ (992.1 and 992.2)
    "Porsche 911",  # Catch air-cooled + anything else missed
]

## Debug: Inspect BaT Page Structure (Optional)

Skip this section - fixture files already saved to `tests/fixtures/`.
Only re-run if BaT changes their DOM structure.

In [5]:
# Debug: save sample pages for selector inspection.
# NOTE: this cell hits the live site and overwrites the HTML fixtures, so only
# run it when the selectors need to be re-checked.
FIXTURE_DIR = "../tests/fixtures"
PAGE_LOAD_WAIT = 3  # Seconds to wait after each navigation so JS content can load

# (log label, URL to fetch, fixture name passed to save_debug_page)
DEBUG_PAGES = [
    # 1. Search results page
    (
        "search page",
        "https://bringatrailer.com/auctions/results/?s=Porsche+911",
        "bat_search_page",
    ),
    # 2. A specific listing page (992.1)
    (
        "listing",
        "https://bringatrailer.com/listing/2020-porsche-911-carrera-4s-coupe-26/",
        "bat_listing_992",
    ),
    # 3. Another listing (997.2) for variety
    (
        "listing",
        "https://bringatrailer.com/listing/2009-porsche-911-carrera-4s-coupe-28/",
        "bat_listing_997",
    ),
]

# Create driver (set HEADLESS=False above to watch)
driver = create_driver(headless=HEADLESS)

try:
    for label, url, fixture_name in DEBUG_PAGES:
        logger.info(f"Fetching {label}: {url}")
        driver.get(url)
        time.sleep(PAGE_LOAD_WAIT)
        save_debug_page(driver, fixture_name, output_dir=FIXTURE_DIR)

    logger.info("Debug pages saved to tests/fixtures/")

finally:
    # Always release the browser, even if a fetch or save fails
    driver.quit()

INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/flatljan/.wdm/drivers/chromedriver/mac64/143.0.7499.169/chromedriver-mac-arm64/chromedriver] found in cache
INFO:__main__:Fetching search page: https://bringatrailer.com/auctions/results/?s=Porsche+911
INFO:price_analysis.scraping.bat:Saved debug page to ../tests/fixtures/bat_search_page.html
INFO:__main__:Fetching listing: https://bringatrailer.com/listing/2020-porsche-911-carrera-4s-coupe-26/
INFO:price_analysis.scraping.bat:Saved debug page to ../tests/fixtures/bat_listing_992.html
INFO:__main__:Fetching listing: https://bringatrailer.com/listing/2009-porsche-911-carrera-4s-coupe-28/
INFO:price_analysis.scraping.bat:Saved debug page to ../tests/fixtures/bat_listing_997.html
INFO:__main__:Debug pages saved to tests/fixtures/


## Run Scraper

This will take a while depending on `MAX_CLICKS` and the number of queries. Each page + listing takes ~3-5 seconds.

In [6]:
# Load existing URLs for incremental scraping (skip already-fetched listings)
existing_urls: set[str] = set()
if OUTPUT_PATH.exists():
    existing_df = pd.read_parquet(OUTPUT_PATH)
    existing_urls = set(existing_df["listing_url"])
    logger.info(f"Loaded {len(existing_urls)} existing URLs - will skip these")

all_listings = []

for query in QUERIES:
    logger.info(f"Scraping: {query}")
    listings = fetch_auctions(
        query=query,
        max_clicks=MAX_CLICKS,
        delay=DELAY,
        headless=HEADLESS,
        existing_urls=existing_urls,
    )
    all_listings.extend(listings)
    logger.info(f"Found {len(listings)} NEW listings for '{query}'")

    # The queries can return the same listing (e.g. the broad "Porsche 911"
    # search also matches cars found by the generation-specific searches).
    # Record what this query fetched so later queries skip it instead of
    # re-scraping it. The URLs are read via the same DataFrame conversion used
    # when saving, which exposes a "listing_url" column.
    # NOTE(review): harmless if fetch_auctions already updates the set itself.
    if listings:
        existing_urls.update(listings_to_dataframe(listings)["listing_url"])

logger.info(f"Total NEW listings scraped: {len(all_listings)}")

INFO:__main__:Loaded 1053 existing URLs - will skip these
INFO:__main__:Scraping: Porsche 991
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/flatljan/.wdm/drivers/chromedriver/mac64/143.0.7499.169/chromedriver-mac-arm64/chromedriver] found in cache
INFO:price_analysis.scraping.bat:Fetching search results: https://bringatrailer.com/auctions/results/?s=Porsche+991
INFO:price_analysis.scraping.bat:Clicked 'Show More' (1/25)
INFO:price_analysis.scraping.bat:Clicked 'Show More' (2/25)
INFO:price_analysis.scraping.bat:Clicked 'Show More' (3/25)
INFO:price_analysis.scraping.bat:Clicked 'Show More' (4/25)
INFO:price_analysis.scraping.bat:Clicked 'Show More' (5/25)
INFO:price_analysis.scraping.bat:Clicked 'Show More' (6/25)
INFO:price_analysis.scraping.bat:Clicked 'Show More' (7/25)
INFO:price_analysis.scraping.bat:Clicked 'Show More' (8/25)
INFO:price_analysis.scraping.bat:Clicked 'Show More' (9/25)


## Convert to DataFrame and Save

In [7]:
# Tabulate the freshly scraped listings
scraped_df = listings_to_dataframe(all_listings)

# Quality gate: drops low-price listings (<$10k) and raises DataQualityError
# when the checks fail; the filtered frame it returns is what we keep.
df = validate_scraped_data(scraped_df)

display(df.head(10))
print(f"\nShape: {df.shape}")

INFO:price_analysis.scraping.bat:Filtered 95 listings below $10,000 (likely parts/salvage)
INFO:price_analysis.scraping.bat:Data quality checks passed: 506 listings, 506 with prices


Unnamed: 0,listing_url,title_raw,sale_price,sale_date,model_year,generation,trim,transmission,mileage,color,location
0,https://bringatrailer.com/listing/2012-porsche...,5k-Mile 2012 Porsche 911 Carrera Coupe,69000,2025-03-10,2012.0,997.2,Carrera,Automatic,5000.0,Platinum Silver Metallic,"Fort Lauderdale, Florida 33312"
1,https://bringatrailer.com/listing/2019-porsche...,"4,400-Mile 2019 Porsche 911 Turbo S Cabriolet",160000,2025-02-11,2019.0,991.2,Turbo S,PDK,4400.0,Black,"Hasbrouck Heights, New Jersey 07604"
2,https://bringatrailer.com/listing/2012-porsche...,24k-Mile 2012 Porsche 911 Carrera S Coupe,75000,2025-09-02,2012.0,997.2,Carrera S,PDK,24000.0,Agate Gray Metallic,"Pompano Beach, Florida 33064"
3,https://bringatrailer.com/listing/2014-porsche...,2014 Porsche 911 Carrera 4S Coupe,59500,2024-12-29,2014.0,991.1,Carrera 4S,PDK,81000.0,Black,"Carlsbad, California 92009"
4,https://bringatrailer.com/listing/2014-porsche...,2014 Porsche 911 GT3,113000,2025-02-13,2014.0,991.1,GT3,PDK,,GT Silver Metallic,"Bedford, New Hampshire 03110"
5,https://bringatrailer.com/listing/2019-porsche...,8k-Mile 2019 Porsche 911 Carrera GTS Coupe 7-S...,147000,2025-08-19,2019.0,991.2,Carrera,Manual,8000.0,Miami Blue,"Las Vegas, Nevada 89135"
6,https://bringatrailer.com/listing/2018-porsche...,823-Mile 2018 Porsche 911 Turbo S Coupe Exclus...,350000,2025-04-29,2018.0,991.2,Turbo S,PDK,823.0,Carrara White Metallic,"Newbury Park, California 91320"
7,https://bringatrailer.com/listing/2018-porsche...,16k-Mile 2018 Porsche 911 Carrera T,82500,2024-12-23,2018.0,991.2,Carrera,PDK,16000.0,GT Silver Metallic,"Phoenix, Arizona 85018"
8,https://bringatrailer.com/listing/2019-porsche...,"1,300-Mile 2019 Porsche 911 Turbo S Coupe",213000,2025-07-07,2019.0,991.2,Turbo S,PDK,1300.0,Black,"Carmel, California 93923"
9,https://bringatrailer.com/listing/2019-porsche...,2019 Porsche 911 GT3 RS Weissach,229000,2025-05-15,2019.0,991.2,GT3 RS,PDK,17000.0,,"Meadow Vista, California 95722"



Shape: (506, 11)


In [8]:
# Merge with any previously saved listings; when a URL appears in both,
# the freshly scraped row wins because it comes last in the concat.
if OUTPUT_PATH.exists():
    previous = pd.read_parquet(OUTPUT_PATH)
    df = pd.concat([previous, df], ignore_index=True).drop_duplicates(
        subset=["listing_url"], keep="last"
    )
    logger.info(f"Merged with existing data: {len(df)} total listings")

df.to_parquet(OUTPUT_PATH, index=False)
logger.info(f"Saved to {OUTPUT_PATH}")

INFO:__main__:Merged with existing data: 1559 total listings
INFO:__main__:Saved to ../data/raw/bat_listings.parquet


## Quick Inspection

In [9]:
print("Counts by generation:")
# Missing generations are kept as their own bucket (dropna=False)
generation_counts = df["generation"].value_counts(dropna=False)
display(generation_counts)

Counts by generation:


generation
997.1    273
992.1    223
991.2    196
996.1    186
996.2    169
997.2    156
None     152
991.1    115
992.2     89
Name: count, dtype: int64

In [10]:
print("Counts by trim:")
# Missing trims are kept as their own bucket (dropna=False)
trim_counts = df["trim"].value_counts(dropna=False)
display(trim_counts)

Counts by trim:


trim
Carrera        366
Carrera S      193
Turbo          184
Carrera 4S     165
None           127
Turbo S        103
GT3             97
Carrera 4       79
GT3 RS          70
GT3 Touring     61
Targa 4S        36
Targa           34
Targa 4         29
GT2 RS          15
Name: count, dtype: int64

In [11]:
print("Counts by transmission:")
# Missing transmissions are kept as their own bucket (dropna=False)
transmission_counts = df["transmission"].value_counts(dropna=False)
display(transmission_counts)

Counts by transmission:


transmission
Manual       924
PDK          481
None          58
Tiptronic     51
Automatic     45
Name: count, dtype: int64

In [12]:
# Parsing quality: count the rows where every required field was populated
required = ["sale_price", "model_year", "generation", "trim", "transmission", "mileage"]
has_all_fields = df[required].notna().all(axis=1)
complete = has_all_fields.sum()
total = len(df)
print(f"\nListings with all required fields: {complete} / {total} ({complete / total * 100:.1f}%)")


Listings with all required fields: 1324 / 1559 (84.9%)


In [13]:
# Sample some listings to verify parsing.
# model_year and mileage are float-typed in the frame, so they are formatted
# as whole numbers here ("1970", "17,000") rather than "1970.0" / "17,000.0".
print("Sample listings for manual verification:")
sample = df.sample(min(10, len(df)), random_state=42)  # fixed seed: same rows every run
for _, row in sample.iterrows():
    year = f"{row['model_year']:.0f}" if pd.notna(row["model_year"]) else "N/A"
    print(f"\n{row['title_raw']}")
    print(f"  Parsed: {year} {row['generation']} {row['trim']} ({row['transmission']})")
    # ",.0f" formats both int and float prices without a trailing ".0"
    print(f"  Price: ${row['sale_price']:,.0f}" if pd.notna(row["sale_price"]) else "  Price: N/A")
    print(f"  Mileage: {row['mileage']:,.0f}" if pd.notna(row["mileage"]) else "  Mileage: N/A")

Sample listings for manual verification:

RSR-Style, Twin-Plug-Powered 1970 Porsche 911E Coupe 5-Speed
  Parsed: 1970.0 None None (Manual)
  Price: $100,911
  Mileage: 17,000.0

2023 Porsche 911 GT3 RS
  Parsed: 2023.0 992.1 GT3 RS (PDK)
  Price: $349,999
  Mileage: 26.0

2002 Porsche 911 Carrera Cabriolet
  Parsed: 2002.0 996.2 Carrera (Automatic)
  Price: $27,250
  Mileage: 57,000.0

25k-Mile 2007 Porsche 911 Carrera S Coupe 6-Speed
  Parsed: 2007.0 997.1 Carrera S (Manual)
  Price: $72,500
  Mileage: 25,000.0

2020 Porsche 911 Carrera 4S Coupe
  Parsed: 2020.0 992.1 Carrera 4S (PDK)
  Price: $121,000
  Mileage: 10,000.0

25-Years-Owned 1999 Porsche 911 Carrera Coupe 6-Speed
  Parsed: 1999.0 996.1 Carrera (Manual)
  Price: $27,750
  Mileage: 76,000.0

32k-Mile 2003 Porsche 911 Carrera 4S Coupe 6-Speed
  Parsed: 2003.0 996.2 Carrera 4S (Manual)
  Price: $37,750
  Mileage: 32,000.0

2013 Porsche 911 Turbo S Coupe
  Parsed: 2013.0 991.1 Turbo S (PDK)
  Price: $84,000
  Mileage: 73,000.0