# 02 - Exploratory Data Analysis

Load the scraped BaT listings, clean and validate them, and explore their distributions and price relationships before modeling.

**Inputs:**
- `data/raw/bat_listings.parquet`

**Outputs:**
- `data/processed/cleaned_listings.parquet`

In [None]:
import logging
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from price_analysis.data import clean_listings
from price_analysis.data.cleaning import get_summary_stats, prepare_model_data

# Notebook-wide setup.
# Emit INFO-level (and above) log records from the root logger.
logging.basicConfig(level=logging.INFO)
# Use seaborn's "whitegrid" theme for every plot in this notebook.
sns.set_theme(style="whitegrid")
# Default figure size for plots that do not pass their own figsize.
plt.rcParams["figure.figsize"] = (12, 6)

In [None]:
# Filesystem layout, resolved relative to the notebook's working directory.
DATA_DIR = Path("../data")
RAW_PATH = DATA_DIR.joinpath("raw", "bat_listings.parquet")
PROCESSED_DIR = DATA_DIR.joinpath("processed")
OUTPUT_PATH = PROCESSED_DIR.joinpath("cleaned_listings.parquet")

# Create the output folder up front (no-op when it already exists).
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

## Load Raw Data

In [None]:
# Read the raw listings from the parquet file at RAW_PATH.
df_raw = pd.read_parquet(RAW_PATH)
# Report the row count, then preview the first rows.
print(f"Raw data: {len(df_raw)} listings")
display(df_raw.head())

In [None]:
# Print a concise summary of the raw frame (columns, dtypes, non-null counts).
df_raw.info()

## Clean and Validate

In [None]:
# Run the project's cleaning step on the raw frame. drop_invalid=False is passed
# explicitly; presumably this keeps rows that fail validation so they can still
# be inspected below — confirm against clean_listings.
df = clean_listings(df_raw, drop_invalid=False)
# Preview the first rows of the cleaned frame.
display(df.head())

In [None]:
# Headline numbers for the cleaned frame, as computed by the project helper.
stats = get_summary_stats(df)

# Listing counts.
print(f"Total listings: {stats['n_listings']}")
print(f"Valid listings: {stats['n_valid']}")

# Each range is read as a pair: element 0 and element 1.
date_lo, date_hi = stats["date_range"][0], stats["date_range"][1]
print(f"Date range: {date_lo} to {date_hi}")

price_lo, price_hi = stats["price_range"][0], stats["price_range"][1]
print(f"Price range: ${price_lo:,} to ${price_hi:,}")

# Medians, printed with thousands separators.
median_price = stats["price_median"]
print(f"Median price: ${median_price:,}")

median_mileage = stats["mileage_median"]
print(f"Median mileage: {median_mileage:,}")

## Missing Data Report

In [None]:
# Per-column count of missing values and the same figure as a percentage of rows.
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100).round(1)

# One row per column of df, with the count and percentage side by side.
missing_report = missing.to_frame("missing").assign(pct=missing_pct)

# Show only columns that actually have gaps, worst first.
has_gaps = missing_report["missing"] > 0
display(missing_report.loc[has_gaps].sort_values("missing", ascending=False))

## Distribution Plots

In [None]:
# 2x2 grid of histograms for the main numeric columns.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# One entry per panel, in row-major order: (values, bin count, x label, title).
# Price and mileage are divided by 1000 so the axes read in thousands.
panels = [
    (df["sale_price"].dropna() / 1000, 50, "Sale Price ($k)", "Price Distribution"),
    (df["log_price"].dropna(), 50, "Log(Sale Price)", "Log Price Distribution (for modeling)"),
    (df["mileage"].dropna() / 1000, 50, "Mileage (k miles)", "Mileage Distribution"),
    (df["age"].dropna(), 30, "Age (years)", "Age Distribution"),
]

# axes.flat walks the grid row by row, matching the order of `panels`.
for panel_ax, (values, n_bins, x_label, panel_title) in zip(axes.flat, panels):
    sns.histplot(values, bins=n_bins, ax=panel_ax)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_title(panel_title)

fig.tight_layout()
plt.show()

## Categorical Counts

In [None]:
# Three side-by-side horizontal bar charts of category frequencies.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Frequency tables; trims are limited to the ten most common values.
gen_counts = df["generation"].value_counts()
trim_counts = df["trim"].value_counts().head(10)
trans_counts = df["transmission"].value_counts()

panel_specs = [
    (gen_counts, "Listings by Generation"),
    (trim_counts, "Listings by Trim (Top 10)"),
    (trans_counts, "Listings by Transmission"),
]

# Draw each table on its own axis, left to right.
for panel_ax, (counts, panel_title) in zip(axes, panel_specs):
    sns.barplot(x=counts.values, y=counts.index, ax=panel_ax, orient="h")
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Count")

fig.tight_layout()
plt.show()

## Price Over Time

In [None]:
# Restrict the trend plot to a fixed set of generations.
key_gens = ["991.1", "991.2", "992.1", "997.1", "997.2"]
df_key = df.loc[df["generation"].isin(key_gens)].copy()

fig, ax = plt.subplots(figsize=(14, 7))
for gen_name in key_gens:
    gen_rows = df_key.loc[df_key["generation"] == gen_name]
    # Skip generations with five or fewer listings.
    if len(gen_rows) <= 5:
        continue

    # Median sale price per calendar month of sale.
    sale_month = gen_rows["sale_date"].dt.to_period("M")
    monthly = gen_rows["sale_price"].groupby(sale_month).median()

    # Periods are converted to timestamps for the x axis; prices are in $k.
    month_starts = monthly.index.to_timestamp()
    ax.plot(
        month_starts,
        monthly.values / 1000,
        label=gen_name,
        marker="o",
        markersize=3,
        alpha=0.7,
    )

ax.set_xlabel("Sale Date")
ax.set_ylabel("Median Sale Price ($k)")
ax.set_title("Price Trends by Generation")
ax.legend()
fig.tight_layout()
plt.show()

## Price by Mileage

In [None]:
# Scatter of price against mileage, one series per key generation.
# Relies on key_gens and df_key from the "Price Over Time" cell.
fig, ax = plt.subplots(figsize=(12, 7))

for gen_name in key_gens:
    gen_rows = df_key.loc[df_key["generation"] == gen_name]
    # Both axes are shown in thousands.
    miles_k = gen_rows["mileage"] / 1000
    price_k = gen_rows["sale_price"] / 1000
    ax.scatter(miles_k, price_k, s=20, alpha=0.5, label=gen_name)

ax.set_xlabel("Mileage (k miles)")
ax.set_ylabel("Sale Price ($k)")
ax.set_title("Price vs Mileage by Generation")
ax.legend()
fig.tight_layout()
plt.show()

## Manual vs PDK Premium

In [None]:
# Compare manual vs PDK prices by generation (controlling for trim):
# median sale price per (generation, trim, transmission), pivoted so each
# transmission becomes a column.
trans_comparison = (
    df[df["transmission"].isin(["Manual", "PDK"])]
    .groupby(["generation", "trim", "transmission"])["sale_price"]
    .median()
    .unstack("transmission")
    # unstack only creates columns for transmissions present in the data.
    # Pin both columns (all-NaN when absent) so the lookups below cannot
    # raise KeyError on a dataset that lacks one of them.
    .reindex(columns=["Manual", "PDK"])
)

# Absolute premium, and the premium as a percentage of the PDK median.
trans_comparison["manual_premium"] = trans_comparison["Manual"] - trans_comparison["PDK"]
trans_comparison["manual_premium_pct"] = (
    trans_comparison["manual_premium"] / trans_comparison["PDK"] * 100
).round(1)

# Show only generation/trim pairs with NaN-free rows (both medians present),
# largest premium first.
display(trans_comparison.dropna().sort_values("manual_premium", ascending=False).head(15))

## Correlation Matrix

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
numeric_cols = ["sale_price", "log_price", "mileage", "age", "model_year", "sale_year"]
corr = df.loc[:, numeric_cols].corr()

fig, ax = plt.subplots(figsize=(8, 6))
# Diverging palette centred on zero; each cell shows its value to two decimals.
sns.heatmap(
    corr,
    ax=ax,
    cmap="coolwarm",
    center=0,
    annot=True,
    fmt=".2f",
)
ax.set_title("Correlation Matrix")
fig.tight_layout()
plt.show()

## Prepare Model-Ready Data

In [None]:
# Build the modeling frame from the cleaned listings using the project helper.
# NOTE(review): df_model is only counted and previewed in this notebook; the
# save cell writes df, not df_model — confirm that is intended.
df_model = prepare_model_data(df)
print(f"Model-ready data: {len(df_model)} listings")
display(df_model.head())

In [None]:
# Save cleaned data
# Writes df (the output of clean_listings called with drop_invalid=False) to
# OUTPUT_PATH, without the index.
df.to_parquet(OUTPUT_PATH, index=False)
print(f"Saved cleaned data to {OUTPUT_PATH}")