# 01 - BaT Scraping

Scrape completed Porsche 911 auctions from Bring a Trailer.

**Outputs:**
- `data/raw/bat_listings.parquet` - Raw scraped data

In [None]:
import logging
import pandas as pd
from pathlib import Path

from price_analysis.scraping import fetch_auctions
from price_analysis.scraping.bat import listings_to_dataframe

# Configure the root logger to emit INFO-level (and above) messages.
logging.basicConfig(level=logging.INFO)
# Logger named after this module/notebook; used by the cells below.
logger = logging.getLogger(__name__)

In [None]:
# Filesystem layout (relative to the notebook's working directory)
DATA_DIR = Path("../data")
RAW_DIR = DATA_DIR.joinpath("raw")
OUTPUT_PATH = RAW_DIR.joinpath("bat_listings.parquet")

# Make sure the raw-data directory exists before anything is written to it.
RAW_DIR.mkdir(parents=True, exist_ok=True)

## Configure Scraping

Adjust search queries and pagination as needed.

In [None]:
# Search terms passed to the scraper, one scrape per entry.
# A single broad query picks up all 911s; add narrower ones per generation if needed.
QUERIES = ["Porsche 911"]

# Scraper tuning knobs
MAX_PAGES = 50    # Upper bound on pages fetched per query
DELAY = 2.5       # Pause between requests, in seconds (be polite!)
HEADLESS = True   # Switch to False to watch the browser while debugging

## Run Scraper

This can take a while, depending on `MAX_PAGES`: each results page and each individual listing takes roughly 3-5 seconds to fetch.

In [None]:
# Options shared by every query; only the search term changes per call.
scrape_options = {
    "max_pages": MAX_PAGES,
    "delay": DELAY,
    "headless": HEADLESS,
}

# Accumulates the listings returned for every query, in query order.
all_listings = []

for query in QUERIES:
    logger.info(f"Scraping: {query}")
    listings = fetch_auctions(query=query, **scrape_options)
    all_listings += listings
    logger.info(f"Found {len(listings)} listings for '{query}'")

logger.info(f"Total listings scraped: {len(all_listings)}")

## Convert to DataFrame and Save

In [None]:
# Turn the scraped listings into a DataFrame and eyeball the first rows.
df = listings_to_dataframe(all_listings)
preview = df.head(10)
display(preview)
print(f"\nShape: {df.shape}")

In [None]:
# Merge this run with any previously saved listings, then write the result.
# New rows are concatenated after the existing ones, so keep="last" below
# makes a re-scraped listing replace its older row.
merged_with_existing = OUTPUT_PATH.exists()
if merged_with_existing:
    existing = pd.read_parquet(OUTPUT_PATH)
    df = pd.concat([existing, df], ignore_index=True)

# De-duplicate on every run, not only when merging: the same listing can be
# returned more than once within a single scrape (e.g. by overlapping queries),
# and the first run would otherwise persist those duplicates.
# The column check keeps a frame without "listing_url" from raising KeyError.
if "listing_url" in df.columns:
    df = df.drop_duplicates(subset=["listing_url"], keep="last")
    # Dropping rows leaves gaps in the index; restore a clean 0..n-1 range.
    df = df.reset_index(drop=True)

if merged_with_existing:
    logger.info(f"Merged with existing data: {len(df)} total listings")

df.to_parquet(OUTPUT_PATH, index=False)
logger.info(f"Saved to {OUTPUT_PATH}")

## Quick Inspection

In [None]:
# Frequency of each parsed generation value
print("Counts by generation:")
generation_counts = df["generation"].value_counts()
display(generation_counts)

In [None]:
# Frequency of each parsed trim value
print("Counts by trim:")
trim_counts = df["trim"].value_counts()
display(trim_counts)

In [None]:
# Frequency of each parsed transmission value
print("Counts by transmission:")
transmission_counts = df["transmission"].value_counts()
display(transmission_counts)

In [None]:
# Parsing quality: count the rows where every required field is non-null.
required = ["sale_price", "model_year", "generation", "trim", "transmission", "mileage"]
total = len(df)
complete = df[required].notna().all(axis=1).sum()
# Guard the percentage against an empty DataFrame (nothing scraped), where
# dividing by the row count would be a division by zero.
pct_complete = complete / total * 100 if total else 0.0
print(f"\nListings with all required fields: {complete} / {total} ({pct_complete:.1f}%)")

In [None]:
# Print a few random rows next to their raw titles so the parsed fields
# can be checked by eye. At most 5 rows, fewer if the frame is smaller.
print("Sample listings for manual verification:")
n_samples = min(5, len(df))
sample = df.sample(n_samples)
for _, row in sample.iterrows():
    print(f"\n{row['title_raw']}")
    print(f"  Parsed: {row['model_year']} {row['generation']} {row['trim']} ({row['transmission']})")

    # Missing values are shown as "N/A" instead of being formatted as numbers.
    if pd.notna(row['sale_price']):
        print(f"  Price: ${row['sale_price']:,}")
    else:
        print("  Price: N/A")

    if pd.notna(row['mileage']):
        print(f"  Mileage: {row['mileage']:,}")
    else:
        print("  Mileage: N/A")