In [None]:
import pandas as pd
import numpy as np
import os

# Load the raw listings CSV into `df` (the path is relative to the kernel's
# working directory), print its (rows, columns) shape, and show the first
# rows as the cell's output.
df = pd.read_csv("../data/listings.csv")
print(f"Original shape: {df.shape}")
df.head()


In [None]:
# Build `df_clean`: a copy of `df` whose "price" column is numeric, keeping
# only rows with a usable (parsed and strictly positive) price. `df` itself
# is left untouched so this cell can be re-run safely.

# Normalise the textual price one step at a time: drop the currency sign,
# drop thousands separators, then trim surrounding whitespace.
price_str = df["price"].astype(str)
price_str = price_str.str.replace("$", "", regex=False)
price_str = price_str.str.replace(",", "", regex=False)
price_str = price_str.str.strip()

# errors="coerce" turns anything that still is not a number into NaN, which
# the row filter below then discards together with zero/negative prices.
df_clean = (
    df
    .assign(price=pd.to_numeric(price_str, errors="coerce"))
    .loc[lambda frame: frame["price"].notna() & (frame["price"] > 0)]
)

print(f"Cleaned shape: {df_clean.shape}")
print(f"Removed {len(df) - len(df_clean)} rows with invalid prices")
df_clean.head()


In [None]:
# Write the cleaned listings to disk as CSV (without the row index) and
# echo the destination path as the cell's output.
out_path = "../data/processed/listings_cleaned.csv"

# Create the destination folder first; exist_ok=True makes re-runs harmless.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
df_clean.to_csv(out_path, index=False)

print(f"Cleaned data saved to: {out_path}")
out_path
