In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the vehicle listings; every later cell works from the `vehicles` frame.
vehicles = pd.read_csv('./data/vehicles.csv')

# Quick sanity report: how many rows were loaded and the extremes of `price`.
price_column = vehicles['price']
lowest_price = price_column.min()
highest_price = price_column.max()
print(f"Total records: {len(vehicles):,}")
print(f"Price range: ${lowest_price:,.2f} to ${highest_price:,.2f}")


In [None]:
# Build a 1000-bin histogram of vehicle prices and tabulate it as `price_dist`
# (one row per bin: edges, count, percentage, cumulative percentage, label).
n_bins = 1000

# np.histogram raises ValueError when NaN is present because it cannot
# auto-detect a finite range, so only rows that have a price are binned.
valid_prices = vehicles['price'].dropna()
counts, bin_edges = np.histogram(valid_prices, bins=n_bins)

# Percentages are expressed relative to every row in `vehicles`.
total_vehicles = len(vehicles)

# Create a dataframe for easy viewing
price_dist = pd.DataFrame({
    'Range_Start': bin_edges[:-1],
    'Range_End': bin_edges[1:],
    'Count': counts,
})

# Share of all vehicles falling in each bin, rounded for display
price_dist['Percentage'] = (price_dist['Count'] / total_vehicles * 100).round(2)

# Accumulate the raw counts and round once at the end. Summing the
# already-rounded per-bin percentages would let rounding error pile up across
# 1000 bins, so the running total could drift noticeably away from 100%.
price_dist['Cumulative_Pct'] = (
    price_dist['Count'].cumsum() / total_vehicles * 100
).round(2)

# Human-readable label for each bin, built directly from the bin edges
price_dist['Range'] = [
    f"${start:,.2f} - ${end:,.2f}"
    for start, end in zip(bin_edges[:-1], bin_edges[1:])
]

print("Price Distribution Summary (1000 bins)")
print("=" * 80)
print(f"Total bins: {n_bins}")
print(f"Non-empty bins: {(counts > 0).sum()}")
print(f"Bin width: ${(bin_edges[1] - bin_edges[0]):,.2f}")
print("\nFirst 20 ranges with vehicles:")
print(price_dist[price_dist['Count'] > 0][['Range', 'Count', 'Percentage', 'Cumulative_Pct']].head(20).to_string(index=False))


In [None]:
# Rank the histogram bins by vehicle count and list the 20 most populated ones.
ranking_columns = ['Range', 'Count', 'Percentage', 'Cumulative_Pct']
busiest_bins = price_dist.nlargest(20, 'Count')
top_ranges = busiest_bins[ranking_columns]

print("\nTop 20 Price Ranges (by count):")
print("=" * 80)
print(top_ranges.to_string(index=False))


In [None]:
# Visualize the price distribution twice: once over the full range and once
# zoomed in to prices at or below the 99th percentile.
def _draw_price_panel(ax, prices, title, bar_color, mean_color, median_color, bins):
    """Draw one price histogram with dashed mean/median markers on ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    prices : pandas Series of prices to plot.
    title : title text for the panel.
    bar_color : fill colour of the histogram bars.
    mean_color, median_color : colours of the dashed mean / median lines.
    bins : number of histogram bins.

    Returns
    -------
    tuple
        ``(mean, median)`` of ``prices``, as shown in the legend.
    """
    ax.hist(prices, bins=bins, color=bar_color, alpha=0.7, edgecolor='black', linewidth=0.1)
    ax.set_xlabel('Price ($)', fontsize=12, fontweight='bold')
    ax.set_ylabel('Frequency', fontsize=12, fontweight='bold')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3, axis='y')
    ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x:,.0f}'))

    # Compute each statistic once and reuse it for both the line and its label.
    panel_mean = prices.mean()
    panel_median = prices.median()
    ax.axvline(panel_mean, color=mean_color, linestyle='--', linewidth=2,
               label=f'Mean: ${panel_mean:,.0f}')
    ax.axvline(panel_median, color=median_color, linestyle='--', linewidth=2,
               label=f'Median: ${panel_median:,.0f}')
    ax.legend(fontsize=10)
    return panel_mean, panel_median


fig, axes = plt.subplots(2, 1, figsize=(16, 10))
ax1, ax2 = axes

# Plot 1: Full distribution with 1000 bins
mean_price, median_price = _draw_price_panel(
    ax1,
    vehicles['price'],
    f'Price Distribution ({n_bins} bins) - Full Range',
    bar_color='steelblue',
    mean_color='red',
    median_color='green',
    bins=n_bins,
)

# Plot 2: Zoomed in (up to 99th percentile) for better detail
price_99 = vehicles['price'].quantile(0.99)
price_filtered = vehicles.loc[vehicles['price'] <= price_99, 'price']
_draw_price_panel(
    ax2,
    price_filtered,
    f'Price Distribution ({n_bins} bins) - Up to 99th Percentile (${price_99:,.0f})',
    bar_color='lightcoral',
    mean_color='darkred',
    median_color='darkgreen',
    bins=n_bins,
)

plt.tight_layout()
plt.show()


In [None]:
# Summarise how many vehicles fall into a handful of broad price bands.
print("\n" + "=" * 80)
print("SUMMARY STATISTICS BY PRICE RANGE")
print("=" * 80)

# Each band is (inclusive lower bound, exclusive upper bound, display label).
price_bands = [
    (0, 5000, "Under $5k"),
    (5000, 10000, "$5k - $10k"),
    (10000, 15000, "$10k - $15k"),
    (15000, 20000, "$15k - $20k"),
    (20000, 30000, "$20k - $30k"),
    (30000, 50000, "$30k - $50k"),
    (50000, 100000, "$50k - $100k"),
    (100000, float('inf'), "Over $100k")
]

price_col = vehicles['price']
total_vehicles = len(vehicles)

# Count the rows whose price lies in [lower, upper) for every band.
band_counts = [
    (band_name, (price_col.ge(lower) & price_col.lt(upper)).sum())
    for lower, upper, band_name in price_bands
]

# Format counts and shares as display strings, one record per band.
summary_data = [
    {
        'Range': band_name,
        'Count': f"{n_in_band:,}",
        'Percentage': f"{n_in_band / total_vehicles * 100:.2f}%",
    }
    for band_name, n_in_band in band_counts
]

summary_df = pd.DataFrame(summary_data)
print("\n" + summary_df.to_string(index=False))
print("\n" + "=" * 80)


In [None]:
# Working with the full distribution table
# `price_dist` holds one row per histogram bin, with these columns:
#   Range_Start    - lower edge of the bin
#   Range_End      - upper edge of the bin
#   Count          - number of vehicles in the bin
#   Percentage     - share of all vehicles, in percent
#   Cumulative_Pct - running total of the percentage
#   Range          - formatted "$start - $end" label

preview_columns = ['Range', 'Count', 'Percentage']
bin_preview = price_dist[preview_columns].iloc[100:110]

print(f"\nFull distribution dataframe 'price_dist' is available with {len(price_dist)} rows")
print(f"Shape: {price_dist.shape}")
print(f"\nColumns: {list(price_dist.columns)}")
print("\nExample: View bins 100-110")
print(bin_preview.to_string())

# To analyse the table in Excel or another tool, export it to CSV:
# price_dist.to_csv('price_distribution_1000_bins.csv', index=False)
