In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

# City folders to process (each is expected to be a sub-folder of data/)
cities = ["NYC", "Boston", "Washington_DC"]

# Locate the project root. When the kernel is started from notebooks/ the
# root is one level up; otherwise the working directory itself is used.
current_dir = Path().resolve()
if current_dir.name == "notebooks":
    project_root = current_dir.parent
else:
    project_root = current_dir
data_dir = project_root / "data"

# Echo the resolved locations so a wrong working directory is obvious
print(f"Current directory: {current_dir}")
print(f"Project root: {project_root}")
print(f"Data directory: {data_dir}")
print(f"\nProcessing listings for cities: {', '.join(cities)}")


Current directory: /Users/anishj29/Desktop/Github Projects/Airbnb-Price-Predictor/notebooks
Project root: /Users/anishj29/Desktop/Github Projects/Airbnb-Price-Predictor
Data directory: /Users/anishj29/Desktop/Github Projects/Airbnb-Price-Predictor/data

Processing listings for cities: NYC, Boston, Washington_DC


In [2]:
# Process each city's listings
def _parse_price(raw_price):
    """Convert a raw listings price column to numeric values.

    Each value is cast to a string, stripped of "$" and "," characters and
    of surrounding whitespace, then parsed with ``pd.to_numeric``.

    Parameters
    ----------
    raw_price : pandas.Series
        The unprocessed price column.

    Returns
    -------
    pandas.Series
        Numeric prices; values that cannot be parsed become NaN.
    """
    price_str = (
        raw_price
          .astype(str)
          .str.replace("$", "", regex=False)
          .str.replace(",", "", regex=False)
          .str.strip()
    )
    return pd.to_numeric(price_str, errors="coerce")


# List of (city, cleaned DataFrame) tuples, in the order of `cities`
all_cleaned_data = []

for city in cities:
    city_dir = data_dir / city
    listings_path = city_dir / "listings.csv"

    if not listings_path.exists():
        print(f"⚠ Skipping {city}: {listings_path} not found")
        continue

    print(f"\n{'='*60}")
    print(f"Processing {city}...")
    print(f"{'='*60}")

    # Read the listings CSV
    df = pd.read_csv(listings_path)
    print(f"Original shape: {df.shape}")

    # Without a price column there is nothing to clean; skip this city
    # instead of letting a KeyError abort the remaining cities.
    if "price" not in df.columns:
        print(f"⚠ Skipping {city}: no 'price' column in {listings_path}")
        continue

    # Replace the raw price text with numeric values (df itself is untouched)
    df_clean = df.assign(price=_parse_price(df["price"]))

    # Keep only strictly positive prices. NaN compares False against 0, so
    # missing and unparseable prices are dropped by the same mask.
    original_count = len(df_clean)
    df_clean = df_clean[df_clean["price"] > 0]

    print(f"Cleaned shape: {df_clean.shape}")
    print(f"Removed {original_count - len(df_clean)} rows with invalid prices")

    # Store cleaned data with city identifier
    all_cleaned_data.append((city, df_clean))

print(f"\n{'='*60}")
print(f"Summary: Processed {len(all_cleaned_data)} cities")
print(f"{'='*60}")



Processing NYC...
Original shape: (36111, 79)
Cleaned shape: (21328, 79)
Removed 14783 rows with invalid prices

Processing Boston...
Original shape: (4419, 79)
Cleaned shape: (3506, 79)
Removed 913 rows with invalid prices

Processing Washington_DC...
Original shape: (6423, 79)
Cleaned shape: (4846, 79)
Removed 1577 rows with invalid prices

Summary: Processed 3 cities


In [3]:
# Save cleaned CSVs for each city
processed_dir = data_dir / "processed"
# parents=True: also create data/ if it is missing, rather than raising
# FileNotFoundError; exist_ok=True keeps re-runs of this cell harmless.
processed_dir.mkdir(parents=True, exist_ok=True)

# Paths (as strings) of every CSV written by this cell
saved_files = []

for city, df_clean in all_cleaned_data:
    # Save to processed folder with city name
    out_path = processed_dir / f"{city}_listings_cleaned.csv"
    df_clean.to_csv(out_path, index=False)
    saved_files.append(str(out_path))
    print(f"✓ {city}: Saved {len(df_clean)} listings to {out_path}")

if saved_files:
    print("\n✓ All cleaned data saved!")
    print("\nSaved files:")
    for f in saved_files:
        print(f"  - {f}")
else:
    # Nothing was cleaned upstream, so do not claim success.
    print("\n⚠ No cleaned data to save - no city listings were processed")


✓ NYC: Saved 21328 listings to /Users/anishj29/Desktop/Github Projects/Airbnb-Price-Predictor/data/processed/NYC_listings_cleaned.csv
✓ Boston: Saved 3506 listings to /Users/anishj29/Desktop/Github Projects/Airbnb-Price-Predictor/data/processed/Boston_listings_cleaned.csv
✓ Washington_DC: Saved 4846 listings to /Users/anishj29/Desktop/Github Projects/Airbnb-Price-Predictor/data/processed/Washington_DC_listings_cleaned.csv

✓ All cleaned data saved!

Saved files:
  - /Users/anishj29/Desktop/Github Projects/Airbnb-Price-Predictor/data/processed/NYC_listings_cleaned.csv
  - /Users/anishj29/Desktop/Github Projects/Airbnb-Price-Predictor/data/processed/Boston_listings_cleaned.csv
  - /Users/anishj29/Desktop/Github Projects/Airbnb-Price-Predictor/data/processed/Washington_DC_listings_cleaned.csv
